[llvm] [AArch64] Expand vector ops when NEON and SVE are unavailable. (PR #90833)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Thu May 2 09:20:16 PDT 2024


https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/90833

>From 651e42fe0bcb33ae4576401ac50333b9fd8d17d7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 26 Apr 2024 11:04:54 +0100
Subject: [PATCH] [AArch64] Expand vector ops when NEON and SVE are
 unavailable.

Unlike `+noneon`, we must assume that vector types are available, i.e. it is
valid to pass vectors to functions and to return them from functions. However,
the compiler must make sure to scalarize any vector operations.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |    6 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   53 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    2 +
 ...streaming-mode-fixed-length-and-combine.ll |  226 +-
 ...treaming-mode-fixed-length-bit-counting.ll | 2167 +++++++-
 ...sve-streaming-mode-fixed-length-bitcast.ll |   30 +-
 ...e-streaming-mode-fixed-length-bitselect.ll |   32 +-
 .../sve-streaming-mode-fixed-length-concat.ll |  119 +-
 ...e-streaming-mode-fixed-length-ext-loads.ll |  338 +-
 ...ing-mode-fixed-length-extract-subvector.ll |   50 +-
 ...ng-mode-fixed-length-extract-vector-elt.ll |   54 +-
 ...e-streaming-mode-fixed-length-fcopysign.ll |  840 ++-
 ...ve-streaming-mode-fixed-length-fp-arith.ll | 3177 ++++++++---
 ...streaming-mode-fixed-length-fp-compares.ll | 4788 +++++++++--------
 ...-streaming-mode-fixed-length-fp-convert.ll |   29 +-
 ...aming-mode-fixed-length-fp-extend-trunc.ll |  729 ++-
 .../sve-streaming-mode-fixed-length-fp-fma.ll |  569 +-
 ...e-streaming-mode-fixed-length-fp-minmax.ll | 2040 ++++---
 ...eaming-mode-fixed-length-fp-reduce-fa64.ll |   26 +-
 ...e-streaming-mode-fixed-length-fp-reduce.ll | 1438 +++--
 ...streaming-mode-fixed-length-fp-rounding.ll | 2030 ++++++-
 ...e-streaming-mode-fixed-length-fp-select.ll |  305 +-
 ...e-streaming-mode-fixed-length-fp-to-int.ll | 2254 ++++++--
 ...-streaming-mode-fixed-length-fp-vselect.ll |  511 +-
 ...ing-mode-fixed-length-insert-vector-elt.ll |  367 +-
 ...e-streaming-mode-fixed-length-int-arith.ll | 2123 +++++++-
 ...treaming-mode-fixed-length-int-compares.ll | 1048 +++-
 ...sve-streaming-mode-fixed-length-int-div.ll | 2044 +++----
 ...streaming-mode-fixed-length-int-extends.ll | 3716 ++++++++++---
 ...eaming-mode-fixed-length-int-immediates.ll | 3425 +++++++++++-
 ...sve-streaming-mode-fixed-length-int-log.ll | 1503 +++++-
 ...-streaming-mode-fixed-length-int-minmax.ll | 2404 ++++++++-
 ...ing-mode-fixed-length-int-mla-neon-fa64.ll |   47 +-
 ...ve-streaming-mode-fixed-length-int-mulh.ll | 1664 +++++-
 ...-streaming-mode-fixed-length-int-reduce.ll | 1642 +++++-
 ...sve-streaming-mode-fixed-length-int-rem.ll | 2654 ++++-----
 ...-streaming-mode-fixed-length-int-select.ll |  581 +-
 ...-streaming-mode-fixed-length-int-shifts.ll | 1632 +++++-
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 1895 +++++--
 ...streaming-mode-fixed-length-int-vselect.ll |  817 ++-
 ...-streaming-mode-fixed-length-ld2-alloca.ll |  118 +-
 ...reaming-mode-fixed-length-limit-duplane.ll |  145 +-
 .../sve-streaming-mode-fixed-length-loads.ll  |   33 +-
 ...-streaming-mode-fixed-length-log-reduce.ll |  888 ++-
 ...streaming-mode-fixed-length-masked-load.ll | 3314 +++++++++---
 ...treaming-mode-fixed-length-masked-store.ll |  806 ++-
 ...eaming-mode-fixed-length-optimize-ptrue.ll |  937 +++-
 ...streaming-mode-fixed-length-permute-rev.ll |  472 +-
 ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 1261 ++++-
 .../sve-streaming-mode-fixed-length-ptest.ll  |  399 +-
 .../sve-streaming-mode-fixed-length-rev.ll    |  936 +++-
 ...e-streaming-mode-fixed-length-sdiv-pow2.ll |  768 ++-
 ...sve-streaming-mode-fixed-length-shuffle.ll |   72 +-
 ...treaming-mode-fixed-length-splat-vector.ll |  245 +-
 .../sve-streaming-mode-fixed-length-stores.ll |   60 +-
 ...e-streaming-mode-fixed-length-subvector.ll |    8 +-
 ...treaming-mode-fixed-length-trunc-stores.ll |   64 +-
 .../sve-streaming-mode-fixed-length-trunc.ll  | 2789 +++++++++-
 ...eaming-mode-fixed-length-vector-shuffle.ll |  339 +-
 .../sve-streaming-mode-test-register-mov.ll   |    6 +-
 60 files changed, 49846 insertions(+), 13189 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bfc3e08c1632de..3175d75eea0860 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3731,8 +3731,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   }
   case ISD::SUB: {
     EVT VT = Node->getValueType(0);
-    assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
-           TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
+    assert((VT.isFixedLengthVector() || // fixed length ADD can be expanded to
+                                        // scalar ADD
+            (TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+             TLI.isOperationLegalOrCustom(ISD::XOR, VT))) &&
            "Don't know how to expand this subtraction!");
     Tmp1 = DAG.getNOT(dl, Node->getOperand(1), VT);
     Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, dl, VT));
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b27d204f3dded0..40cb6164f0094a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -357,7 +357,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
   }
 
-  if (Subtarget->hasNEON()) {
+  if (Subtarget->isNeonAvailable()) {
     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
     // Someone set us up the NEON.
@@ -378,6 +378,27 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     addQRTypeForNEON(MVT::v2i64);
     addQRTypeForNEON(MVT::v8f16);
     addQRTypeForNEON(MVT::v8bf16);
+  } else if (Subtarget->hasNEON() || Subtarget->useSVEForFixedLengthVectors()) {
+    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
+
+    addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v4f16, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v4bf16, &AArch64::FPR64RegClass);
+
+    addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v8f16, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v8bf16, &AArch64::FPR128RegClass);
   }
 
   if (Subtarget->hasSVEorSME()) {
@@ -1125,7 +1146,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
-  if (Subtarget->hasNEON()) {
+  if (Subtarget->isNeonAvailable()) {
     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
     // silliness like this:
     for (auto Op :
@@ -1328,6 +1349,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     // FADDP custom lowering
     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
       setOperationAction(ISD::FADD, VT, Custom);
+  } else {
+    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+        setOperationAction(Op, VT, Expand);
+
+      if (VT.is128BitVector() || VT.is64BitVector()) {
+        setOperationAction(ISD::LOAD, VT, Legal);
+        setOperationAction(ISD::STORE, VT, Legal);
+        setOperationAction(ISD::BITCAST, VT,
+                           Subtarget->isLittleEndian() ? Legal : Expand);
+      }
+      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
+    }
   }
 
   if (Subtarget->hasSME()) {
@@ -9377,7 +9416,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                               SelectionDAG &DAG) const {
-  if (!Subtarget->hasNEON())
+  if (!Subtarget->isNeonAvailable() &&
+      !Subtarget->useSVEForFixedLengthVectors())
     return SDValue();
 
   EVT VT = Op.getValueType();
@@ -14110,6 +14150,13 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
 }
 
+bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
+    EVT VT, unsigned DefinedValues) const {
+  if (!Subtarget->isNeonAvailable()) // No NEON: never expand via shuffles.
+    return false;
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   // Currently no fixed length shuffles that require SVE are legal.
   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fbdc4de5617fe9..5a402b8df099f0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1020,6 +1020,8 @@ class AArch64TargetLowering : public TargetLowering {
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);
 
+  bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override;
+
   unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
                                   SelectionDAG &DAG) const;
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index fd9259048df543..4c3188fd7b2381 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -18,8 +18,15 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff000000ff0000
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i8> %b, <i8 0, i8 255, i8 0, i8 255>
  ret <4 x i8> %c
@@ -37,8 +44,21 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_8xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <8 x i8> %c
@@ -56,8 +76,33 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_16xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <16 x i8> %c
@@ -78,9 +123,57 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_32xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #62]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %b = and <32 x i8> %ap, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255,
                          i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
@@ -102,9 +195,11 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i16> %b, <i16 0, i16 65535>
  ret <2 x i16> %c
@@ -122,8 +217,15 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535>
  ret <4 x i16> %c
@@ -141,8 +243,21 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_8xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <8 x i16> %c
@@ -163,9 +278,33 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_16xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <16 x i16> %c
@@ -183,9 +322,11 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i32> %b, <i32 0, i32 4294967295>
  ret <2 x i32> %c
@@ -203,8 +344,13 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffffffff00000000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <4 x i32> %c
@@ -225,9 +371,17 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_8xi32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffffffff00000000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <8 x i32> %c
@@ -245,7 +399,11 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i64> %b, <i64 0, i64 18446744073709551615>
  ret <2 x i64> %c
@@ -265,8 +423,16 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
-; NONEON-NOSVE-NEXT:    mov v1.d[0], xzr
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i64> %b, <i64 0, i64 18446744073709551615, i64 0, i64 18446744073709551615>
  ret <4 x i64> %c
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index 8f0378252a54ef..3a71207db4153d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -22,12 +22,26 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    mov w8, #8 // =0x8
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w9
+; NONEON-NOSVE-NEXT:    clz w10, w10
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    sub w9, w9, #24
+; NONEON-NOSVE-NEXT:    sub w10, w10, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w11
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -44,7 +58,42 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -61,7 +110,74 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -79,10 +195,140 @@ define void @ctlz_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    clz v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
@@ -103,12 +349,17 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w9
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    sub w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -125,7 +376,26 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -142,7 +412,42 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -160,10 +465,76 @@ define void @ctlz_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
@@ -182,7 +553,15 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -199,7 +578,20 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -217,10 +609,32 @@ define void @ctlz_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
@@ -239,23 +653,13 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #1
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #2
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #4
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #8
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #16
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #32
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    mvn v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -272,23 +676,15 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #1
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #2
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #4
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #8
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #16
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #32
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -306,42 +702,22 @@ define void @ctlz_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #1
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #1
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #2
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #2
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #4
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #4
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #8
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #8
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #16
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #16
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #32
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #32
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
@@ -365,10 +741,37 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d2, x10
+; NONEON-NOSVE-NEXT:    fmov d3, x8
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    cnt v2.8b, v2.8b
+; NONEON-NOSVE-NEXT:    cnt v3.8b, v3.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
+; NONEON-NOSVE-NEXT:    uaddlv h2, v2.8b
+; NONEON-NOSVE-NEXT:    uaddlv h3, v3.8b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -385,7 +788,67 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    str d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #135]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #134]
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #133]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #131]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #129]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #143]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #141]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #139]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -402,7 +865,126 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #270]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #266]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #262]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #258]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #287]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #285]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -420,10 +1002,240 @@ define void @ctpop_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #576
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 592
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #543]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #542]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #541]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #540]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #539]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #538]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #537]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #535]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #534]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #533]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #532]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #531]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #530]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #529]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #527]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #526]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #496]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #525]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #480]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #523]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #522]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #521]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #416]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #519]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #518]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #517]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #515]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #514]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #513]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #575]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #574]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #573]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #572]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #571]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #570]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #569]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #567]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #566]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #565]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #564]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #563]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #562]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #561]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #559]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #557]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #556]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #555]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #554]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #553]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #549]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #547]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #546]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #545]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #544]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #576
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
@@ -443,11 +1255,23 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fmov d1, x9
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -464,8 +1288,39 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -482,8 +1337,67 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -501,12 +1415,128 @@ define void @ctpop_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #270]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #266]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #262]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #258]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #318]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #314]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #288]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
@@ -525,9 +1555,24 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -544,9 +1589,37 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -564,14 +1637,65 @@ define void @ctpop_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #160]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
@@ -590,10 +1714,15 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -610,10 +1739,23 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -631,16 +1773,37 @@ define void @ctpop_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #120]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #112]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #104]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
@@ -665,17 +1828,30 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #256 // =0x100
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v2.4h
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -693,10 +1869,50 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.8b, #1
-; NONEON-NOSVE-NEXT:    sub v1.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -714,10 +1930,90 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.16b, #1
-; NONEON-NOSVE-NEXT:    sub v1.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -737,15 +2033,172 @@ define void @cttz_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v3.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
@@ -766,17 +2219,19 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #65536 // =0x10000
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v2.2s
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -794,14 +2249,30 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -819,14 +2290,50 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v1.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -846,20 +2353,92 @@ define void @cttz_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v3.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
@@ -879,14 +2458,17 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -904,14 +2486,24 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.4s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    dup v1.4s, w8
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -931,20 +2523,40 @@ define void @cttz_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v3.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
@@ -964,14 +2576,14 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sub d1, d0, d1
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -989,14 +2601,17 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.2d, x8
-; NONEON-NOSVE-NEXT:    sub v1.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -1016,22 +2631,26 @@ define void @cttz_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    sub v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 64dc7ae117d3a9..48c818f867870d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -15,8 +15,14 @@ define void @bitcast_v4i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: bitcast_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [x0, #3]
+; NONEON-NOSVE-NEXT:    strb w11, [x1, #3]
+; NONEON-NOSVE-NEXT:    strb w10, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [x1]
 ; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i8>, ptr %a
   %cast = bitcast <4 x i8> %load to <4 x i8>
@@ -102,12 +108,22 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: bitcast_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #4]
 ; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i16>, ptr %a
   %cast = bitcast <2 x i16> %load to <2 x half>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 5e06cd62118d7a..d0ee85c1b9e127 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -34,13 +34,39 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
 ;
 ; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x2]
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v5.16b
 ; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %pre_cond = load <8 x i32>, ptr %pre_cond_ptr
   %left = load <8 x i32>, ptr %left_ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index ee997228e4532b..07057dfbdf7607 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -44,7 +44,27 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
@@ -62,9 +82,9 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -152,7 +172,17 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i16> %res
@@ -171,9 +201,9 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -243,7 +273,14 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i32> %res
@@ -262,9 +299,9 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
@@ -332,9 +369,9 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %res
@@ -407,7 +444,14 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %res
@@ -425,9 +469,9 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
@@ -497,7 +541,14 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %res
@@ -516,9 +567,9 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -586,9 +637,9 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %res
@@ -732,7 +783,11 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v32i8_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -755,7 +810,11 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v16i16_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -775,7 +834,11 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v8i32_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -794,7 +857,11 @@ define void @concat_v4i64_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v4i64_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x i64>, ptr %a
   %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 42aa67fb2ab8b4..45a3e791c3ff09 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -15,8 +15,28 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v8i8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %ap
   %val = zext <8 x i8> %a to <8 x i16>
@@ -33,8 +53,18 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v4i16i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %ap
   %val = zext <4 x i16> %a to <4 x i32>
@@ -51,8 +81,15 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v2i32i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %ap
   %val = zext <2 x i32> %a to <2 x i64>
@@ -77,13 +114,14 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 ; NONEON-NOSVE-LABEL: load_zext_v2i64i256:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x0, x4, [sp], #16
 ; NONEON-NOSVE-NEXT:    mov x1, xzr
 ; NONEON-NOSVE-NEXT:    mov x2, xzr
 ; NONEON-NOSVE-NEXT:    mov x3, xzr
 ; NONEON-NOSVE-NEXT:    mov x5, xzr
 ; NONEON-NOSVE-NEXT:    mov x6, xzr
-; NONEON-NOSVE-NEXT:    mov x4, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x0, d0
 ; NONEON-NOSVE-NEXT:    mov x7, xzr
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
@@ -110,20 +148,75 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_sext_v16i8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %ap
   %val = sext <16 x i8> %a to <16 x i32>
@@ -144,12 +237,24 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap)  {
 ; NONEON-NOSVE-LABEL: load_sext_v8i16i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = sext <8 x i16> %a to <8 x i32>
@@ -186,34 +291,31 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    add x10, x8, #32
-; NONEON-NOSVE-NEXT:    add x11, x8, #96
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
-; NONEON-NOSVE-NEXT:    st1 { v0.d }[1], [x10]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    st1 { v1.d }[1], [x11]
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    asr x10, x10, #63
-; NONEON-NOSVE-NEXT:    str d0, [x8]
-; NONEON-NOSVE-NEXT:    asr x9, x9, #63
-; NONEON-NOSVE-NEXT:    str d1, [x8, #64]
-; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #16]
-; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #48]
-; NONEON-NOSVE-NEXT:    str x9, [x8, #40]
-; NONEON-NOSVE-NEXT:    fmov x9, d1
-; NONEON-NOSVE-NEXT:    str x10, [x8, #8]
-; NONEON-NOSVE-NEXT:    asr x10, x11, #63
-; NONEON-NOSVE-NEXT:    asr x9, x9, #63
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x11, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x12, x13, [sp, #80]
+; NONEON-NOSVE-NEXT:    asr x10, x9, #63
+; NONEON-NOSVE-NEXT:    asr x14, x11, #63
 ; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #112]
-; NONEON-NOSVE-NEXT:    str x10, [x8, #104]
-; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #80]
-; NONEON-NOSVE-NEXT:    str x9, [x8, #72]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp x9, x10, [x8, #96]
+; NONEON-NOSVE-NEXT:    asr x9, x13, #63
+; NONEON-NOSVE-NEXT:    asr x10, x12, #63
+; NONEON-NOSVE-NEXT:    stp x14, x14, [x8, #80]
+; NONEON-NOSVE-NEXT:    stp x11, x14, [x8, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #48]
+; NONEON-NOSVE-NEXT:    stp x13, x9, [x8, #32]
+; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #16]
+; NONEON-NOSVE-NEXT:    stp x12, x10, [x8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = sext <4 x i32> %a to <4 x i256>
@@ -251,18 +353,26 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ;
 ; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    dup v1.2d, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    asr x1, x0, #63
-; NONEON-NOSVE-NEXT:    asr x5, x8, #63
-; NONEON-NOSVE-NEXT:    mov x2, x1
-; NONEON-NOSVE-NEXT:    mov x3, x1
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x5
-; NONEON-NOSVE-NEXT:    mov x6, x5
-; NONEON-NOSVE-NEXT:    mov x7, x5
-; NONEON-NOSVE-NEXT:    fmov x4, d1
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    asr x8, x10, #63
+; NONEON-NOSVE-NEXT:    stp x9, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x10, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x0, x1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x2, x3, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp x4, x5, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp x6, x7, [sp, #112]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = sext <2 x i64> %a to <2 x i256>
@@ -300,30 +410,88 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v16i16i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #256]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %ap
   %val = zext <16 x i16> %a to <16 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index d050ddc77640ef..bcdc03d021c947 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -31,7 +31,18 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4)
   ret <4 x i1> %ret
@@ -63,7 +74,18 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4)
   ret <4 x i8> %ret
@@ -178,8 +200,12 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
   ret <1 x i32> %ret
@@ -275,8 +301,12 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
   ret <2 x half> %ret
@@ -331,8 +361,12 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
   ret <1 x float> %ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index b2cf818e6e3c73..ebd75d54b1451a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -19,8 +19,11 @@ define half @extractelement_v2f16(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x half> %op1, i64 1
   ret half %r
@@ -36,8 +39,11 @@ define half @extractelement_v4f16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x half> %op1, i64 3
   ret half %r
@@ -53,7 +59,10 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <8 x half> %op1, i64 7
   ret half %r
@@ -69,7 +78,11 @@ define half @extractelement_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0, #30]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
@@ -86,8 +99,11 @@ define float @extractelement_v2f32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov s0, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x float> %op1, i64 1
   ret float %r
@@ -103,7 +119,10 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov s0, v0.s[3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x float> %op1, i64 3
   ret float %r
@@ -119,7 +138,11 @@ define float @extractelement_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0, #28]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
@@ -147,7 +170,10 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov d0, v0.d[1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x double> %op1, i64 1
   ret double %r
@@ -163,7 +189,11 @@ define double @extractelement_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0, #24]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index bed5dd53c519b8..f7f42cda93e6c1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -32,12 +32,58 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    ldr d2, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.4h, w8
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x half>, ptr %bp
@@ -68,12 +114,102 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x half>, ptr %bp
@@ -108,13 +244,191 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %ap
   %b = load <16 x half>, ptr %bp
@@ -147,12 +461,26 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    ldr d2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x float>, ptr %bp
@@ -183,12 +511,37 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -223,13 +576,63 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>, ptr %ap
   %b = load <8 x float>, ptr %bp
@@ -262,12 +665,25 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load <2 x double>, ptr %bp
@@ -302,13 +718,39 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -347,13 +789,27 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x double>, ptr %bp
@@ -402,14 +858,39 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v2.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -447,13 +928,27 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load < 2 x float>, ptr %bp
@@ -502,19 +997,41 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -554,13 +1071,49 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -606,15 +1159,51 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr d2, [x0]
+; NONEON-NOSVE-NEXT:    str d2, [sp]
 ; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtxn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fcvtxn v1.2s, v1.2d
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -662,14 +1251,83 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x float>, ptr %bp
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index 662a8f2b55fdd8..d63d878272a6eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -21,10 +21,39 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -42,10 +71,39 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -63,14 +121,66 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -90,25 +200,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -129,7 +341,17 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -147,7 +369,22 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -167,11 +404,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -192,7 +457,16 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -212,11 +486,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -241,10 +531,39 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -262,10 +581,39 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -283,14 +631,66 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fdiv v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -310,26 +710,127 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v5.4s, v4.8h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v4.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldr q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl2 v6.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fdiv v3.4s, v3.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fdiv v5.4s, v6.4s, v5.4s
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -350,7 +851,17 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -368,7 +879,22 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -388,11 +914,39 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fdiv v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -413,7 +967,16 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -433,11 +996,27 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fdiv v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fdiv v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -463,42 +1042,48 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s4, h17
-; NONEON-NOSVE-NEXT:    fcvt s5, h18
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h16
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
   ret <2 x half> %res
@@ -517,42 +1102,48 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s4, h17
-; NONEON-NOSVE-NEXT:    fcvt s5, h18
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h16
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
   ret <4 x half> %res
@@ -571,75 +1162,84 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s7, h17
-; NONEON-NOSVE-NEXT:    fcvt s16, h18
-; NONEON-NOSVE-NEXT:    fcvt s17, h19
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmadd s4, s5, s4, s3
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmadd s6, s17, s16, s7
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s7, h18
-; NONEON-NOSVE-NEXT:    fcvt s16, h19
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov v3.h[1], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    fmadd s5, s16, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    mov v3.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fmadd s17, s19, s18, s17
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s4, s16, s7, s4
-; NONEON-NOSVE-NEXT:    mov v3.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h18
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
-; NONEON-NOSVE-NEXT:    fcvt h16, s17
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fmadd s5, s7, s6, s5
-; NONEON-NOSVE-NEXT:    mov v3.h[4], v16.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
-; NONEON-NOSVE-NEXT:    mov v3.h[5], v4.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h4, s5
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v3.h[6], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov v3.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v3.16b
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
   ret <8 x half> %res
@@ -660,146 +1260,161 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q3, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q5, q2, [x2]
-; NONEON-NOSVE-NEXT:    mov h25, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s18, h1
-; NONEON-NOSVE-NEXT:    mov h22, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    mov h20, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h26, v5.h[1]
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[1]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s25, h25
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h29, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s23, h17
-; NONEON-NOSVE-NEXT:    mov h17, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s21, h16
-; NONEON-NOSVE-NEXT:    fmadd s6, s19, s18, s6
-; NONEON-NOSVE-NEXT:    fcvt s18, h20
-; NONEON-NOSVE-NEXT:    fcvt s19, h22
-; NONEON-NOSVE-NEXT:    fcvt s20, h24
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s22, h5
-; NONEON-NOSVE-NEXT:    fcvt s24, h4
-; NONEON-NOSVE-NEXT:    fcvt s26, h26
-; NONEON-NOSVE-NEXT:    fcvt s27, h27
-; NONEON-NOSVE-NEXT:    fcvt s28, h28
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fmadd s21, s25, s23, s21
-; NONEON-NOSVE-NEXT:    fcvt s23, h3
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[2]
-; NONEON-NOSVE-NEXT:    fmadd s18, s20, s19, s18
-; NONEON-NOSVE-NEXT:    mov h19, v3.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[3]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmadd s22, s23, s24, s22
-; NONEON-NOSVE-NEXT:    fcvt h20, s21
-; NONEON-NOSVE-NEXT:    mov h21, v2.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt s24, h29
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fmadd s16, s17, s16, s7
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt h26, s26
-; NONEON-NOSVE-NEXT:    mov h29, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov v6.h[1], v20.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s17, h21
-; NONEON-NOSVE-NEXT:    fcvt s20, h30
-; NONEON-NOSVE-NEXT:    fmadd s19, s19, s24, s23
-; NONEON-NOSVE-NEXT:    fcvt s21, h31
-; NONEON-NOSVE-NEXT:    fcvt h7, s22
-; NONEON-NOSVE-NEXT:    fcvt s22, h25
-; NONEON-NOSVE-NEXT:    fcvt s23, h27
-; NONEON-NOSVE-NEXT:    fcvt s24, h28
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[4]
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[4]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov v6.h[2], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s17, s21, s20, s17
-; NONEON-NOSVE-NEXT:    mov v7.h[1], v26.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    fmadd s19, s24, s23, s22
-; NONEON-NOSVE-NEXT:    mov h26, v5.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s20, h25
-; NONEON-NOSVE-NEXT:    fcvt s21, h27
-; NONEON-NOSVE-NEXT:    fcvt s22, h28
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[5]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s23, h29
-; NONEON-NOSVE-NEXT:    fcvt s24, h30
-; NONEON-NOSVE-NEXT:    fcvt s25, h31
-; NONEON-NOSVE-NEXT:    mov h29, v2.h[6]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov v7.h[2], v18.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
-; NONEON-NOSVE-NEXT:    mov h20, v5.h[6]
-; NONEON-NOSVE-NEXT:    mov h21, v4.h[6]
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s26, h26
-; NONEON-NOSVE-NEXT:    fmadd s23, s25, s24, s23
-; NONEON-NOSVE-NEXT:    fcvt s27, h27
-; NONEON-NOSVE-NEXT:    fcvt s28, h28
-; NONEON-NOSVE-NEXT:    mov v6.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s17
-; NONEON-NOSVE-NEXT:    fcvt s17, h29
-; NONEON-NOSVE-NEXT:    fcvt s24, h30
-; NONEON-NOSVE-NEXT:    fcvt s25, h31
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    fcvt s21, h21
-; NONEON-NOSVE-NEXT:    fcvt s22, h22
-; NONEON-NOSVE-NEXT:    mov v7.h[3], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    mov h5, v5.h[7]
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[7]
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s17, s25, s24, s17
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
-; NONEON-NOSVE-NEXT:    mov v6.h[4], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s23
-; NONEON-NOSVE-NEXT:    mov v7.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s26
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #56]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #88]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v6.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    mov v7.h[5], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s3, s3, s4, s5
-; NONEON-NOSVE-NEXT:    fcvt h4, s19
-; NONEON-NOSVE-NEXT:    fcvt h5, s17
-; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
-; NONEON-NOSVE-NEXT:    mov v7.h[6], v4.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v6.h[6], v5.h[0]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v6.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -822,8 +1437,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
   ret <2 x float> %res
@@ -842,8 +1468,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
   ret <4 x float> %res
@@ -864,12 +1508,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -892,8 +1569,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
   ret <2 x double> %res
@@ -914,12 +1602,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -945,10 +1652,39 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -966,10 +1702,39 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -987,14 +1752,66 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -1014,25 +1831,127 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1053,7 +1972,17 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -1071,7 +2000,22 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -1091,11 +2035,39 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1116,7 +2088,16 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -1136,11 +2117,27 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmul v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -1164,8 +2161,30 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x half> %op
   ret <2 x half> %res
@@ -1182,8 +2201,30 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x half> %op
   ret <4 x half> %res
@@ -1200,8 +2241,50 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <8 x half> %op
   ret <8 x half> %res
@@ -1219,11 +2302,92 @@ define void @fneg_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = fneg <16 x half> %op
@@ -1242,7 +2406,15 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x float> %op
   ret <2 x float> %res
@@ -1259,7 +2431,20 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x float> %op
   ret <4 x float> %res
@@ -1277,10 +2462,32 @@ define void @fneg_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fneg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = fneg <8 x float> %op
@@ -1299,7 +2506,15 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x double> %op
   ret <2 x double> %res
@@ -1317,10 +2532,22 @@ define void @fneg_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fneg v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = fneg <4 x double> %op
@@ -1343,26 +2570,30 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fsqrt s4, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1379,26 +2610,30 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fsqrt s4, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1415,44 +2650,50 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s4, s4
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s5, s5
-; NONEON-NOSVE-NEXT:    fcvt h1, s5
-; NONEON-NOSVE-NEXT:    mov v0.h[4], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s6, s6
-; NONEON-NOSVE-NEXT:    fcvt h1, s6
-; NONEON-NOSVE-NEXT:    mov v0.h[5], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s7, s7
-; NONEON-NOSVE-NEXT:    fcvt h1, s7
-; NONEON-NOSVE-NEXT:    mov v0.h[6], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s2, s16
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1470,85 +2711,92 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q16, [x0]
-; NONEON-NOSVE-NEXT:    mov h0, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h17, v16.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s18, h16
-; NONEON-NOSVE-NEXT:    mov h19, v16.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h20, v16.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h21, v16.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h22, v16.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s21, h21
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s22, h22
-; NONEON-NOSVE-NEXT:    mov h23, v16.h[6]
-; NONEON-NOSVE-NEXT:    mov h16, v16.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s23, h23
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fsqrt s0, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s17, s17
-; NONEON-NOSVE-NEXT:    fcvt h17, s17
-; NONEON-NOSVE-NEXT:    fsqrt s18, s18
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    mov v18.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fcvt h0, s3
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s19, s19
-; NONEON-NOSVE-NEXT:    fcvt h17, s19
-; NONEON-NOSVE-NEXT:    mov v18.h[2], v17.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s4, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s20, s20
-; NONEON-NOSVE-NEXT:    fcvt h3, s20
-; NONEON-NOSVE-NEXT:    mov v18.h[3], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s5, s5
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s21, s21
-; NONEON-NOSVE-NEXT:    fcvt h3, s21
-; NONEON-NOSVE-NEXT:    mov v18.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s6, s6
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s22, s22
-; NONEON-NOSVE-NEXT:    fcvt h3, s22
-; NONEON-NOSVE-NEXT:    mov v18.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s7, s7
-; NONEON-NOSVE-NEXT:    fcvt h0, s7
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s23, s23
-; NONEON-NOSVE-NEXT:    fcvt h3, s23
-; NONEON-NOSVE-NEXT:    mov v18.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s16, s16
-; NONEON-NOSVE-NEXT:    fcvt h3, s16
-; NONEON-NOSVE-NEXT:    mov v18.h[7], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q18, q2, [x0]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
@@ -1567,7 +2815,15 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1584,7 +2840,20 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1602,10 +2871,32 @@ define void @fsqrt_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsqrt v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
@@ -1624,7 +2915,15 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1642,10 +2941,22 @@ define void @fsqrt_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fsqrt v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
@@ -1669,10 +2980,39 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -1690,10 +3030,39 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -1711,14 +3080,66 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fsub v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fsub v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -1738,25 +3159,127 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fsub v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fsub v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsub v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1777,7 +3300,17 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -1795,7 +3328,22 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -1815,11 +3363,39 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsub v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1840,7 +3416,16 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -1860,11 +3445,27 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fsub v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fsub v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -1888,7 +3489,30 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1905,7 +3529,30 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1922,7 +3569,50 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1940,10 +3630,92 @@ define void @fabs_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    bic v1.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
@@ -1962,7 +3734,15 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1979,7 +3759,20 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1997,10 +3790,32 @@ define void @fabs_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fabs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
@@ -2019,7 +3834,15 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -2037,10 +3860,22 @@ define void @fabs_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fabs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index d4810c78cb53dc..1ad0d7c2c691be 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -23,10 +23,24 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x half> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i16>
@@ -46,10 +60,39 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x half> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
@@ -69,61 +112,66 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s2, s5
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <8 x half> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
@@ -145,119 +193,127 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -280,7 +336,18 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x float> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
@@ -300,7 +367,24 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x float> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -322,11 +406,43 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -347,7 +463,13 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcmp d0, d1
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <1 x double> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
@@ -367,7 +489,17 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x double> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
@@ -389,11 +521,29 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcmeq v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp]
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -426,135 +576,143 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h2
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s3
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h3
-; NONEON-NOSVE-NEXT:    fmov s2, w12
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmov s3, w17
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
-; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
 ; NONEON-NOSVE-NEXT:    fcmp s1, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
-; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
-; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -587,150 +745,158 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_one_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h2
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w13, mi
-; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    csetm w14, mi
-; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w15, mi
-; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s3
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w16, mi
-; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h3
-; NONEON-NOSVE-NEXT:    fmov s2, w12
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w17, mi
-; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmov s3, w17
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
-; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
 ; NONEON-NOSVE-NEXT:    fcmp s1, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
-; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
-; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
-  %op1 = load <16 x half>, ptr %a
-  %op2 = load <16 x half>, ptr %b
-  %cmp = fcmp one <16 x half> %op1, %op2
-  %sext = sext <16 x i1> %cmp to <16 x i16>
-  store <16 x i16> %sext, ptr %c
-  ret void
-}
-
-;
-; FCMP UNE
-;
-
-define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
-; CHECK-LABEL: fcmp_une_v16f16:
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %cmp = fcmp one <16 x half> %op1, %op2
+  %sext = sext <16 x i1> %cmp to <16 x i16>
+  store <16 x i16> %sext, ptr %c
+  ret void
+}
+
+;
+; FCMP UNE
+;
+
+define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: fcmp_une_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
@@ -744,119 +910,127 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_une_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ne
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ne
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ne
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -885,119 +1059,127 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, gt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, gt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, gt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1029,119 +1211,127 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, hi
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, hi
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, hi
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1170,123 +1360,131 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_olt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, mi
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, mi
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, mi
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
-  %op1 = load <16 x half>, ptr %a
-  %op2 = load <16 x half>, ptr %b
-  %cmp = fcmp olt <16 x half> %op1, %op2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %cmp = fcmp olt <16 x half> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i16>
   store <16 x i16> %sext, ptr %c
   ret void
@@ -1314,119 +1512,127 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ult_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, lt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, lt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, lt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1455,119 +1661,127 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oge_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ge
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ge
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ge
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1599,119 +1813,127 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_uge_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, pl
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, pl
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, pl
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1740,263 +1962,279 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ole_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ls
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ls
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ls
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %cmp = fcmp ole <16 x half> %op1, %op2
+  %sext = sext <16 x i1> %cmp to <16 x i16>
+  store <16 x i16> %sext, ptr %c
+  ret void
+}
+
+;
+; FCMP ULE
+;
+
+define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: fcmp_ule_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-NEXT:    eor z0.d, z2.d, z0.d
+; CHECK-NEXT:    stp q1, q0, [x2]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ule_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
-  %op1 = load <16 x half>, ptr %a
-  %op2 = load <16 x half>, ptr %b
-  %cmp = fcmp ole <16 x half> %op1, %op2
-  %sext = sext <16 x i1> %cmp to <16 x i16>
-  store <16 x i16> %sext, ptr %c
-  ret void
-}
-
-;
-; FCMP ULE
-;
-
-define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
-; CHECK-LABEL: fcmp_ule_v16f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-NEXT:    eor z0.d, z2.d, z0.d
-; CHECK-NEXT:    stp q1, q0, [x2]
-; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ule_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, le
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2025,119 +2263,127 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_uno_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, vs
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, vs
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, vs
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2169,119 +2415,127 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ord_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, vc
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, vc
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2310,119 +2564,127 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_eq_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2451,119 +2713,127 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ne_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ne
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ne
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ne
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2592,119 +2862,127 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_gt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, gt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, gt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, gt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2733,119 +3011,127 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_lt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, lt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, lt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, lt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2874,119 +3160,127 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ge_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ge
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ge
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ge
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -3015,119 +3309,127 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_le_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, le
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index ac0b6c0e0440ce..9f362fd667309a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -21,13 +21,28 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fp_convert_combine_crash:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.4s, #8.00000000
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0, #3
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w10, s2, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w11, s0, #3
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s3, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w12, s1, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s2, #3
+; NONEON-NOSVE-NEXT:    stp w11, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s3, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w10, s0, #3
+; NONEON-NOSVE-NEXT:    stp w8, w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w10, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %f = load <8 x float>, ptr %a
   %mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 16f30adbd14e02..f3a81cf8f701dd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -21,8 +21,16 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <2 x half> %a to <2 x float>
   store <2 x float> %res, ptr %b
@@ -41,8 +49,22 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <4 x half> %a to <4 x float>
   store <4 x float> %res, ptr %b
@@ -64,13 +86,33 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <8 x half> %a to <8 x float>
   store <8 x float> %res, ptr %b
@@ -99,17 +141,57 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <16 x half> %a to <16 x float>
   store <16 x float> %res, ptr %b
@@ -132,9 +214,20 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x float>
@@ -153,9 +246,23 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x float>
@@ -178,13 +285,33 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x float>
@@ -214,17 +341,57 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x float>
@@ -246,9 +413,14 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldr h0, [x0]
 ; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x half>, ptr %a
   %res = fpext <1 x half> %op1 to <1 x double>
@@ -267,10 +439,26 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x double>
@@ -292,15 +480,35 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x double>
@@ -329,22 +537,61 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x double>
@@ -390,34 +637,115 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    fcvtl v5.2d, v5.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v7.2s
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v6.2s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #156]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #148]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #140]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #328]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #188]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #172]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x double>
@@ -440,7 +768,7 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x float>, ptr %a
@@ -460,9 +788,18 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fpext <2 x float> %op1 to <2 x double>
@@ -485,13 +822,23 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fpext <4 x float> %op1 to <4 x double>
@@ -521,17 +868,37 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fpext <8 x float> %op1 to <8 x double>
@@ -554,9 +921,21 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fptrunc <2 x float> %op1 to <2 x half>
@@ -576,8 +955,23 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptrunc <4 x float> %op1 to <4 x half>
@@ -599,10 +993,36 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptrunc <8 x float> %op1 to <8 x half>
@@ -646,10 +1066,22 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %res = fptrunc <2 x double> %op1 to <2 x half>
@@ -671,11 +1103,27 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtxn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtxn v1.2s, v1.2d
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x half>
@@ -698,8 +1146,7 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
 ; NONEON-NOSVE-NEXT:    str s0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <1 x double> %op1 to <1 x float>
@@ -718,8 +1165,16 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <2 x double> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
@@ -740,10 +1195,22 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x float>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index 44d7116e5f8713..32fc6eb54871df 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -21,14 +21,59 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fmul s1, s3, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmul s2, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmul s2, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x half> %op1, %op2
   %res = fadd contract <4 x half> %mul, %op3
@@ -48,22 +93,107 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fmul v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h23, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s3, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s22, h22
+; NONEON-NOSVE-NEXT:    fcvt s23, h23
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmul s5, s1, s3
+; NONEON-NOSVE-NEXT:    fcvt s21, h21
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fmul s3, s4, s3
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s23, s22
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    str h2, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s19, s18
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s17, s16
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h2, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    str h1, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <8 x half> %op1, %op2
   %res = fadd contract <8 x half> %mul, %op3
@@ -85,42 +215,228 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x2]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    stp d15, d14, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d13, d12, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d11, d10, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d9, d8, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 208
+; NONEON-NOSVE-NEXT:    .cfi_offset b8, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset b9, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset b10, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset b11, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset b12, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset b13, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset b14, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset b15, -64
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q18, q19, [x2]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #78]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h15, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt s20, h0
+; NONEON-NOSVE-NEXT:    fcvt s21, h1
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    ldr h13, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h14, [sp, #74]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h12, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h10, [sp, #70]
+; NONEON-NOSVE-NEXT:    fmul s30, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h31, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h28, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h29, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h26, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h23, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt h19, s30
+; NONEON-NOSVE-NEXT:    fcvt s30, h15
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fmul s0, s0, s30
+; NONEON-NOSVE-NEXT:    fcvt s30, h14
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s16, s17, s16
+; NONEON-NOSVE-NEXT:    fmul s6, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s18, s19, s18
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s19, h13
+; NONEON-NOSVE-NEXT:    fmul s4, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldp d15, d14, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fmul s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14] // 2-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    str h18, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #108]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fmul s1, s1, s3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s18
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h11
+; NONEON-NOSVE-NEXT:    fcvt s30, h12
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    ldp d13, d12, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h9
+; NONEON-NOSVE-NEXT:    fcvt s30, h10
+; NONEON-NOSVE-NEXT:    ldp d11, d10, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h31
+; NONEON-NOSVE-NEXT:    fcvt s30, h8
+; NONEON-NOSVE-NEXT:    ldp d9, d8, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h28
+; NONEON-NOSVE-NEXT:    fcvt s28, h29
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s28, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h26
+; NONEON-NOSVE-NEXT:    fcvt s26, h27
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s26, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h24
+; NONEON-NOSVE-NEXT:    fcvt s24, h25
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s24, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s22, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h20
+; NONEON-NOSVE-NEXT:    fcvt s20, h21
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s20, s19
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s16, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s6, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s4, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -144,8 +460,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x float> %op1, %op2
   %res = fadd contract <2 x float> %mul, %op3
@@ -165,8 +492,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x float> %op1, %op2
   %res = fadd contract <4 x float> %mul, %op3
@@ -188,12 +533,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -212,7 +590,12 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
 ;
 ; NONEON-NOSVE-LABEL: fma_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmadd d0, d0, d1, d2
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <1 x double> %op1, %op2
   %res = fadd contract <1 x double> %mul, %op3
@@ -232,8 +615,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x double> %op1, %op2
   %res = fadd contract <2 x double> %mul, %op3
@@ -255,12 +649,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index bc7659c06ad05f..00d18b83a5d0cd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -21,34 +21,39 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -66,60 +71,66 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -139,115 +150,127 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmaxnm s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmaxnm s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmaxnm s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmaxnm s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -268,7 +291,17 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -286,7 +319,22 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -306,11 +354,39 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -327,7 +403,12 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmaxnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -345,7 +426,16 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -365,11 +455,27 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxnm v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -394,34 +500,39 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fminnm s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fminnm s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -439,60 +550,66 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fminnm s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fminnm s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fminnm s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fminnm s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fminnm s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -512,115 +629,127 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fminnm s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fminnm s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s6, s16, s6
-; NONEON-NOSVE-NEXT:    fminnm s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fminnm s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fminnm s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fminnm s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fminnm s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -641,7 +770,17 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -659,7 +798,22 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -679,11 +833,39 @@ define void @fminnm_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminnm v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -700,7 +882,12 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fminnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -718,7 +905,16 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -738,11 +934,27 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminnm v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -767,34 +979,39 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmax s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmax s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmax s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -812,60 +1029,66 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmax s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmax s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmax s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmax s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmax s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -885,115 +1108,127 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmax s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmax s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmax s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmax s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmax s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmax s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmax s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmax s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmax s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmax s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmax s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmax s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmax s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1014,7 +1249,17 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -1032,7 +1277,22 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -1052,11 +1312,39 @@ define void @fmax_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1073,7 +1361,12 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmax d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -1091,7 +1384,16 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -1111,11 +1413,27 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmax v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -1140,34 +1458,39 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmin s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmin s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmin s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -1185,60 +1508,66 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmin s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmin s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmin s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmin s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmin s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -1258,115 +1587,127 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmin s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmin s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmin s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmin s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmin s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmin s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmin s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmin s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmin s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmin s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmin s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmin s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmin s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1387,7 +1728,17 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -1405,7 +1756,22 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -1425,11 +1791,39 @@ define void @fmin_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1446,7 +1840,12 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmin d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -1464,7 +1863,16 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -1484,11 +1892,27 @@ define void @fmin_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmin v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
index fdb81b8e5fe1b6..cf58e5b1b37275 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
@@ -30,26 +30,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index 74a5db4b38e013..09706004796465 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -23,26 +23,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
@@ -71,45 +75,49 @@ define half @fadda_v8f16(half %start, <8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
@@ -154,86 +162,93 @@ define half @fadda_v16f16(half %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
@@ -251,10 +266,13 @@ define float @fadda_v2f32(float %start, <2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
@@ -275,13 +293,15 @@ define float @fadda_v4f32(float %start, <4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
@@ -310,22 +330,25 @@ define float @fadda_v8f32(float %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
@@ -357,9 +380,11 @@ define double @fadda_v2f64(double %start, <2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp], #16
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
@@ -380,13 +405,19 @@ define double @fadda_v4f64(double %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov d2, v3.d[1]
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d3
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp]
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
-; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
@@ -408,26 +439,30 @@ define half @faddv_v4f16(half %start, <4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
@@ -444,45 +479,49 @@ define half @faddv_v8f16(half %start, <8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
@@ -500,54 +539,90 @@ define half @faddv_v16f16(half %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fadd v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    mov h1, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h2
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
@@ -565,8 +640,13 @@ define float @faddv_v2f32(float %start, <2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
@@ -583,8 +663,13 @@ define float @faddv_v4f32(float %start, <4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s4, s3, [sp], #16
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
@@ -604,10 +689,21 @@ define float @faddv_v8f32(float %start, ptr %a) {
 ; NONEON-NOSVE-LABEL: faddv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s4, s3, [sp]
+; NONEON-NOSVE-NEXT:    ldp s5, s6, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s7, s16, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s2
+; NONEON-NOSVE-NEXT:    fadd s3, s7, s5
+; NONEON-NOSVE-NEXT:    fadd s4, s16, s6
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
@@ -639,7 +735,10 @@ define double @faddv_v2f64(double %start, <2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp], #16
+; NONEON-NOSVE-NEXT:    fadd d1, d2, d1
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
@@ -659,8 +758,13 @@ define double @faddv_v4f64(double %start, ptr %a) {
 ; NONEON-NOSVE-LABEL: faddv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v1.2d
-; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d4, d3, [sp], #32
+; NONEON-NOSVE-NEXT:    fadd d1, d3, d1
+; NONEON-NOSVE-NEXT:    fadd d2, d4, d2
+; NONEON-NOSVE-NEXT:    fadd d1, d2, d1
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
@@ -683,22 +787,26 @@ define half @fmaxv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %res
@@ -715,41 +823,45 @@ define half @fmaxv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
   ret half %res
@@ -767,81 +879,86 @@ define half @fmaxv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
@@ -859,7 +976,12 @@ define float @fmaxv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
   ret float %res
@@ -876,7 +998,14 @@ define float @fmaxv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %res
@@ -895,8 +1024,20 @@ define float @fmaxv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaxv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
@@ -926,7 +1067,10 @@ define double @fmaxv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %res
@@ -945,8 +1089,13 @@ define double @fmaxv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaxv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmaxnm d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
@@ -968,22 +1117,26 @@ define half @fminv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %res
@@ -1000,41 +1153,45 @@ define half @fminv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
   ret half %res
@@ -1052,81 +1209,86 @@ define half @fminv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s4
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
@@ -1144,7 +1306,12 @@ define float @fminv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
   ret float %res
@@ -1161,7 +1328,14 @@ define float @fminv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %res
@@ -1180,8 +1354,20 @@ define float @fminv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s2, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fminnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
@@ -1211,7 +1397,10 @@ define double @fminv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %res
@@ -1230,8 +1419,13 @@ define double @fminv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fminnm d0, d2, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d3, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
@@ -1253,22 +1447,26 @@ define half @fmaximumv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
   ret half %res
@@ -1285,41 +1483,45 @@ define half @fmaximumv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
   ret half %res
@@ -1337,81 +1539,86 @@ define half @fmaximumv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmax s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s4
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
@@ -1429,7 +1636,12 @@ define float @fmaximumv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
   ret float %res
@@ -1446,7 +1658,14 @@ define float @fmaximumv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
   ret float %res
@@ -1465,8 +1684,20 @@ define float @fmaximumv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaximumv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmax s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmax s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
@@ -1496,7 +1727,10 @@ define double @fmaximumv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
   ret double %res
@@ -1515,8 +1749,13 @@ define double @fmaximumv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaximumv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmax d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmax d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
@@ -1538,22 +1777,26 @@ define half @fminimumv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
   ret half %res
@@ -1570,41 +1813,45 @@ define half @fminimumv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
   ret half %res
@@ -1622,81 +1869,86 @@ define half @fminimumv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmin s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s4
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
@@ -1714,7 +1966,12 @@ define float @fminimumv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
   ret float %res
@@ -1731,7 +1988,14 @@ define float @fminimumv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
   ret float %res
@@ -1750,8 +2014,20 @@ define float @fminimumv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminimumv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmin s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmin s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
@@ -1781,7 +2057,10 @@ define double @fminimumv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
   ret double %res
@@ -1800,8 +2079,13 @@ define double @fminimumv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminimumv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmin d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmin d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 454683865eb9a9..144c20693e972c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -20,9 +20,30 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -39,9 +60,30 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -58,12 +100,50 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -81,20 +161,92 @@ define void @frintp_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintp v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintp v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op)
@@ -113,7 +265,15 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -130,7 +290,20 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -148,10 +321,32 @@ define void @frintp_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op)
@@ -167,7 +362,12 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -184,7 +384,15 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -202,10 +410,22 @@ define void @frintp_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintp v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op)
@@ -228,9 +448,30 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -247,9 +488,30 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -266,12 +528,50 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -289,20 +589,92 @@ define void @frintm_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintm v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintm v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op)
@@ -321,7 +693,15 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -338,7 +718,20 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -356,10 +749,32 @@ define void @frintm_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op)
@@ -375,7 +790,12 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -392,7 +812,15 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -410,10 +838,22 @@ define void @frintm_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintm v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op)
@@ -436,9 +876,30 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -455,9 +916,30 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -474,12 +956,50 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -497,20 +1017,92 @@ define void @frinti_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frinti v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frinti v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op)
@@ -529,7 +1121,15 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -546,7 +1146,20 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -564,10 +1177,32 @@ define void @frinti_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op)
@@ -583,7 +1218,12 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -600,7 +1240,15 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -618,10 +1266,22 @@ define void @frinti_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frinti v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op)
@@ -644,9 +1304,30 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -663,9 +1344,30 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -682,12 +1384,50 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -705,20 +1445,92 @@ define void @frintx_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintx v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintx v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op)
@@ -737,7 +1549,15 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -754,7 +1574,20 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -772,10 +1605,32 @@ define void @frintx_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op)
@@ -791,7 +1646,12 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -808,7 +1668,15 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -826,10 +1694,22 @@ define void @frintx_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintx v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op)
@@ -852,9 +1732,30 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -871,9 +1772,30 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -890,12 +1812,50 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -913,20 +1873,92 @@ define void @frinta_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frinta v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frinta v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op)
@@ -945,7 +1977,15 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -962,7 +2002,20 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -980,10 +2033,32 @@ define void @frinta_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op)
@@ -999,7 +2074,12 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -1016,7 +2096,15 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1034,10 +2122,22 @@ define void @frinta_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frinta v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op)
@@ -1060,9 +2160,30 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1079,9 +2200,30 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1098,12 +2240,50 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1121,20 +2301,92 @@ define void @frintn_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintn v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintn v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op)
@@ -1153,7 +2405,15 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1170,7 +2430,20 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1188,10 +2461,32 @@ define void @frintn_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op)
@@ -1207,7 +2502,12 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -1224,7 +2524,15 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1242,10 +2550,22 @@ define void @frintn_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintn v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op)
@@ -1268,9 +2588,30 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1287,9 +2628,30 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1306,12 +2668,50 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1329,20 +2729,92 @@ define void @frintz_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintz v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintz v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op)
@@ -1361,7 +2833,15 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1378,7 +2858,20 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1396,10 +2889,32 @@ define void @frintz_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op)
@@ -1415,7 +2930,12 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -1432,7 +2952,15 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1450,10 +2978,22 @@ define void @frintz_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintz v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 0268dd1b5d318f..860c14481c1b9c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -20,10 +20,28 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
@@ -44,10 +62,28 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
@@ -68,10 +104,43 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
@@ -95,16 +164,83 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #60]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #58]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x half>, ptr %a
   %op2 = load volatile <16 x half>, ptr %b
@@ -128,10 +264,18 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
@@ -152,10 +296,23 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
@@ -179,16 +336,43 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x float>, ptr %a
   %op2 = load volatile <8 x float>, ptr %b
@@ -206,10 +390,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
@@ -231,10 +418,17 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2d, x8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel d3, d2, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
@@ -259,16 +453,31 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel d3, d0, d2, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel d3, d0, d2, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x double>, ptr %a
   %op2 = load volatile <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 1c63a3870d682f..fdd7bc1c5676b7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -19,9 +19,26 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -39,16 +56,43 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i16>
@@ -69,22 +113,75 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i16>
@@ -108,9 +205,17 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -128,8 +233,25 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -151,15 +273,41 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i32>
@@ -189,21 +337,73 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i32>
@@ -224,9 +424,13 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -246,14 +450,18 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -280,23 +488,27 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s2
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s3
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptoui <4 x half> %op1 to <4 x i64>
@@ -339,42 +551,43 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s2
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h7
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s3
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s4
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s5
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s6
-; NONEON-NOSVE-NEXT:    fmov d3, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d2, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i64>
@@ -439,76 +652,79 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h4
-; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s6
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s2
-; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s17
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s5
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
-; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s7
-; NONEON-NOSVE-NEXT:    fmov d7, x8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fmov d0, x11
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x13
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s16
-; NONEON-NOSVE-NEXT:    fmov d16, x9
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvtzu x15, s17
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s2
-; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s6
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s4
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
-; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
-; NONEON-NOSVE-NEXT:    fmov d3, x10
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s5
-; NONEON-NOSVE-NEXT:    fmov d2, x15
-; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d4, x8
-; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i64>
@@ -531,7 +747,14 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -549,8 +772,20 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -572,10 +807,31 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i16>
@@ -604,15 +860,56 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptoui <16 x float> %op1 to <16 x i16>
@@ -635,7 +932,14 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -652,7 +956,18 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -670,10 +985,28 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i32>
@@ -697,9 +1030,13 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -717,8 +1054,15 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -740,15 +1084,21 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptoui <4 x float> %op1 to <4 x i64>
@@ -778,21 +1128,33 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i64>
@@ -814,8 +1176,12 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
-; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
@@ -833,8 +1199,14 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -867,11 +1239,27 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i16>
@@ -919,19 +1307,49 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI26_0
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
-; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i16>
@@ -1012,31 +1430,90 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI27_0
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
-; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI27_0]
-; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
-; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
-; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
-; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q7, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q4, [sp]
+; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #286]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #282]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptoui <16 x double> %op1 to <16 x i16>
@@ -1060,9 +1537,12 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
@@ -1080,8 +1560,14 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -1103,10 +1589,19 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i32>
@@ -1135,15 +1630,32 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i32>
@@ -1166,8 +1678,12 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -1184,7 +1700,14 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -1202,10 +1725,20 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i64>
@@ -1228,9 +1761,26 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -1248,16 +1798,43 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i16>
@@ -1278,22 +1855,75 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i16>
@@ -1317,9 +1947,17 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -1337,8 +1975,25 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -1360,15 +2015,41 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i32>
@@ -1398,21 +2079,73 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i32>
@@ -1433,9 +2166,13 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -1456,14 +2193,18 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -1490,23 +2231,27 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s2
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s3
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptosi <4 x half> %op1 to <4 x i64>
@@ -1549,42 +2294,43 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s2
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h7
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s3
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s4
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s5
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s6
-; NONEON-NOSVE-NEXT:    fmov d3, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d2, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i64>
@@ -1649,76 +2395,79 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h4
-; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s6
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s2
-; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s17
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s5
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
-; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s7
-; NONEON-NOSVE-NEXT:    fmov d7, x8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fmov d0, x11
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x13
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s16
-; NONEON-NOSVE-NEXT:    fmov d16, x9
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvtzs x15, s17
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s2
-; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s6
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s4
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
-; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
-; NONEON-NOSVE-NEXT:    fmov d3, x10
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s5
-; NONEON-NOSVE-NEXT:    fmov d2, x15
-; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d4, x8
-; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i64>
@@ -1741,7 +2490,14 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -1759,8 +2515,20 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -1782,10 +2550,31 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i16>
@@ -1814,15 +2603,56 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptosi <16 x float> %op1 to <16 x i16>
@@ -1845,7 +2675,14 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -1862,7 +2699,18 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -1880,10 +2728,28 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i32>
@@ -1907,9 +2773,13 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -1927,8 +2797,15 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -1950,15 +2827,21 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptosi <4 x float> %op1 to <4 x i64>
@@ -1988,21 +2871,33 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i64>
@@ -2026,8 +2921,12 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
-; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
@@ -2045,8 +2944,14 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -2079,11 +2984,27 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i16>
@@ -2131,19 +3052,49 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI61_0
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI61_0]
-; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i16>
@@ -2224,31 +3175,90 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI62_0
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
-; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI62_0]
-; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
-; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
-; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
-; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q7, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q4, [sp]
+; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #286]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #282]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptosi <16 x double> %op1 to <16 x i16>
@@ -2272,9 +3282,12 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
@@ -2292,8 +3305,14 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -2315,10 +3334,19 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i32>
@@ -2347,15 +3375,32 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i32>
@@ -2378,8 +3423,12 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -2396,7 +3445,14 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -2414,10 +3470,20 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 32fe74bbb65f47..ec5965e9069262 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -31,10 +31,27 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v2.4h, v2.4h, v0.4h
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    str w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
@@ -57,9 +74,40 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w9, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
@@ -83,10 +131,68 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #47]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w13, w15, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w13, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w12, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
@@ -107,122 +213,126 @@ define void @select_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h17, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s6, h4
+; NONEON-NOSVE-NEXT:    fcvt s7, h5
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s18, h17
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s20, h19
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s2, h16
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h26, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h28, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h29, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, eq
 ; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[3]
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcvt s7, h3
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s1, s5, s4, eq
+; NONEON-NOSVE-NEXT:    fcmp s18, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    fcvt s18, h21
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #28]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcsel s2, s17, s16, eq
+; NONEON-NOSVE-NEXT:    fcmp s20, s7
+; NONEON-NOSVE-NEXT:    fcvt s16, h5
+; NONEON-NOSVE-NEXT:    fcvt s17, h22
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcsel s3, s19, s3, eq
+; NONEON-NOSVE-NEXT:    fcmp s18, s4
+; NONEON-NOSVE-NEXT:    fcvt s19, h7
+; NONEON-NOSVE-NEXT:    fcvt s23, h20
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcsel s4, s21, s6, eq
 ; NONEON-NOSVE-NEXT:    fcmp s17, s16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s6, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h2
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fmov s4, w14
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s16, h17
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w8
 ; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fmov s5, w14
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s16, s7
-; NONEON-NOSVE-NEXT:    mov h7, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov h16, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w12
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w16
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s17
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w11
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w13
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s17, s16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    fcvt s7, h17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    fcvt s21, h24
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h3, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcsel s5, s22, s5, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s19
+; NONEON-NOSVE-NEXT:    fcvt s22, h16
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h4, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcsel s6, s20, s7, eq
+; NONEON-NOSVE-NEXT:    fcmp s21, s17
+; NONEON-NOSVE-NEXT:    fcvt s20, h19
+; NONEON-NOSVE-NEXT:    fcvt s21, h26
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h5, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcsel s7, s24, s18, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s22
+; NONEON-NOSVE-NEXT:    fcvt s22, h17
+; NONEON-NOSVE-NEXT:    fcvt s23, h27
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h6, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcsel s16, s25, s16, eq
+; NONEON-NOSVE-NEXT:    fcmp s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s21, h18
+; NONEON-NOSVE-NEXT:    fcvt s25, h24
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #58]
+; NONEON-NOSVE-NEXT:    str h7, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcsel s19, s26, s19, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s22
+; NONEON-NOSVE-NEXT:    fcvt s23, h20
+; NONEON-NOSVE-NEXT:    fcvt s26, h28
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #60]
+; NONEON-NOSVE-NEXT:    str h16, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcsel s17, s27, s17, eq
+; NONEON-NOSVE-NEXT:    fcmp s25, s21
+; NONEON-NOSVE-NEXT:    fcvt s25, h22
+; NONEON-NOSVE-NEXT:    fcvt s27, h29
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h19, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcsel s18, s24, s18, eq
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcmp s26, s23
+; NONEON-NOSVE-NEXT:    fcvt s23, h21
+; NONEON-NOSVE-NEXT:    str h17, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvt s26, h24
+; NONEON-NOSVE-NEXT:    fcsel s20, s28, s20, eq
+; NONEON-NOSVE-NEXT:    fcmp s27, s25
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp]
+; NONEON-NOSVE-NEXT:    str h18, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvt s17, h25
+; NONEON-NOSVE-NEXT:    fcvt s18, h27
+; NONEON-NOSVE-NEXT:    fcsel s7, s29, s22, eq
+; NONEON-NOSVE-NEXT:    fcmp s26, s23
+; NONEON-NOSVE-NEXT:    str h20, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcsel s16, s24, s21, eq
+; NONEON-NOSVE-NEXT:    str h7, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s18, s17
+; NONEON-NOSVE-NEXT:    str h16, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcsel s2, s27, s25, eq
+; NONEON-NOSVE-NEXT:    str h2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -249,9 +359,22 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
 ;
 ; NONEON-NOSVE-LABEL: select_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    sbfx w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
@@ -275,10 +398,36 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
 ;
 ; NONEON-NOSVE-LABEL: select_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w9, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
@@ -299,14 +448,45 @@ define void @select_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr s4, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr s17, [sp]
+; NONEON-NOSVE-NEXT:    ldp s6, s7, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, eq
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    ldp s1, s5, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s2, s3, s2, eq
+; NONEON-NOSVE-NEXT:    ldp s16, s3, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcmp s4, s1
+; NONEON-NOSVE-NEXT:    fcsel s1, s4, s1, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s3
+; NONEON-NOSVE-NEXT:    ldr s4, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcsel s3, s5, s3, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s4
+; NONEON-NOSVE-NEXT:    ldr s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcsel s4, s6, s4, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s5
+; NONEON-NOSVE-NEXT:    ldr s6, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcsel s5, s7, s5, eq
+; NONEON-NOSVE-NEXT:    fcmp s16, s6
+; NONEON-NOSVE-NEXT:    ldr s7, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s3, s4, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcsel s6, s16, s6, eq
+; NONEON-NOSVE-NEXT:    fcmp s17, s7
+; NONEON-NOSVE-NEXT:    fcsel s3, s17, s7, eq
+; NONEON-NOSVE-NEXT:    stp s5, s6, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp s3, s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -325,10 +505,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
 ;
 ; NONEON-NOSVE-LABEL: select_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
@@ -352,10 +535,23 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
 ;
 ; NONEON-NOSVE-LABEL: select_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #1
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    sbfx x8, x9, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel d3, d2, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
@@ -376,14 +572,29 @@ define void @select_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcmeq v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d5, d1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d0, d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d4, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, eq
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcsel d2, d3, d2, eq
+; NONEON-NOSVE-NEXT:    fcmp d4, d1
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d1, d4, d1, eq
+; NONEON-NOSVE-NEXT:    fcmp d5, d3
+; NONEON-NOSVE-NEXT:    fcsel d3, d5, d3, eq
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d3, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index c85048ab72e038..006b17ee9babce 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -25,10 +25,21 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i8> %op1, i8 5, i64 3
     ret <4 x i8> %r
@@ -50,10 +61,23 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i8> %op1, i8 5, i64 7
     ret <8 x i8> %r
@@ -75,8 +99,25 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i8> %op1, i8 5, i64 15
     ret <16 x i8> %r
@@ -98,8 +139,25 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w8
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <32 x i8> %op1, i8 5, i64 31
     ret <32 x i8> %r
@@ -122,10 +180,18 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i16> %op1, i16 5, i64 1
     ret <2 x i16> %r
@@ -147,10 +213,21 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i16> %op1, i16 5, i64 3
     ret <4 x i16> %r
@@ -172,8 +249,23 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[7], w8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i16> %op1, i16 5, i64 7
     ret <8 x i16> %r
@@ -195,8 +287,23 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.h[7], w8
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i16> %op1, i16 5, i64 15
     ret <16 x i16> %r
@@ -219,10 +326,18 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i32> %op1, i32 5, i64 1
     ret <2 x i32> %r
@@ -244,8 +359,20 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i32> %op1, i32 5, i64 3
     ret <4 x i32> %r
@@ -267,9 +394,20 @@ define <8 x i32> @insertelement_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x i32>, ptr %a
     %r = insertelement <8 x i32> %op1, i32 5, i64 7
@@ -286,8 +424,12 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x i64> %op1, i64 5, i64 0
     ret <1 x i64> %r
@@ -309,8 +451,18 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i64> %op1, i64 5, i64 1
     ret <2 x i64> %r
@@ -332,9 +484,18 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x8
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x i64>, ptr %a
     %r = insertelement <4 x i64> %op1, i64 5, i64 3
@@ -358,11 +519,14 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; NONEON-NOSVE-LABEL: insertelement_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI14_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI14_0
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    ld1r { v1.4h }, [x8]
-; NONEON-NOSVE-NEXT:    mov v1.h[0], v0.h[0]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [x8, :lo12:.LCPI14_0]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x half> %op1, half 5.0, i64 1
     ret <2 x half> %r
@@ -384,11 +548,22 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI15_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI15_0
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI15_0]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x half> %op1, half 5.0, i64 3
     ret <4 x half> %r
@@ -410,9 +585,24 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI16_0
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI16_0]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x half> %op1, half 5.0, i64 7
     ret <8 x half> %r
@@ -434,10 +624,24 @@ define <16 x half> @insertelement_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI17_0
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI17_0]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = insertelement <16 x half> %op1, half 5.0, i64 15
@@ -461,10 +665,18 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[1], v1.s[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x float> %op1, float 5.0, i64 1
     ret <2 x float> %r
@@ -486,8 +698,20 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
-; NONEON-NOSVE-NEXT:    mov v0.s[3], v1.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x float> %op1, float 5.0, i64 3
     ret <4 x float> %r
@@ -509,9 +733,21 @@ define <8 x float> @insertelement_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s2, #5.00000000
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov v1.s[3], v2.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = insertelement <8 x float> %op1, float 5.0, i64 7
@@ -527,8 +763,12 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x double> %op1, double 5.0, i64 0
     ret <1 x double> %r
@@ -550,8 +790,18 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d1, #5.00000000
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x double> %op1, double 5.0, i64 1
     ret <2 x double> %r
@@ -573,10 +823,19 @@ define <4 x double> @insertelement_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d0, #5.00000000
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
 ; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = insertelement <4 x double> %op1, double 5.0, i64 3
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index da408a11e784d4..cae906e9d6b33d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -20,7 +20,27 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -37,7 +57,43 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -54,7 +110,74 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -72,11 +195,143 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -96,7 +351,18 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -113,7 +379,27 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -130,7 +416,42 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -148,11 +469,79 @@ define void @add_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -172,7 +561,18 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -189,7 +589,24 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -207,11 +624,43 @@ define void @add_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -248,7 +697,17 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -266,11 +725,29 @@ define void @add_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -303,7 +780,27 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -329,7 +826,43 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -355,7 +888,74 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -384,11 +984,143 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mul v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -417,7 +1149,17 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -443,7 +1185,27 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -469,7 +1231,42 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -498,11 +1295,79 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    mul v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -531,7 +1396,17 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -557,7 +1432,22 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -586,11 +1476,39 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mul v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -619,12 +1537,14 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -650,14 +1570,16 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x10, d1
-; NONEON-NOSVE-NEXT:    fmov x11, d0
-; NONEON-NOSVE-NEXT:    mov x8, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -686,25 +1608,27 @@ define void @mul_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x12, d2
-; NONEON-NOSVE-NEXT:    mov x11, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov x9, d3
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
-; NONEON-NOSVE-NEXT:    mul x9, x12, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mul x11, x14, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -728,7 +1652,27 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -745,7 +1689,43 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -762,7 +1742,74 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -780,11 +1827,143 @@ define void @sub_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -804,7 +1983,18 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -821,7 +2011,27 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -838,7 +2048,42 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -856,11 +2101,79 @@ define void @sub_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -880,7 +2193,18 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -897,7 +2221,24 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -915,11 +2256,43 @@ define void @sub_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -939,7 +2312,14 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub d0, d0, d1
+; NONEON-NOSVE-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    mov w9, #1 // =0x1
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #16
+; NONEON-NOSVE-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    add d0, d0, d3
+; NONEON-NOSVE-NEXT:    add d0, d0, d1
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -956,7 +2336,17 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -974,11 +2364,29 @@ define void @sub_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -1003,9 +2411,26 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cneg w8, w9, mi
+; NONEON-NOSVE-NEXT:    cmp w10, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cneg w8, w10, mi
+; NONEON-NOSVE-NEXT:    cmp w11, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cneg w8, w11, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false)
   ret <4 x i8> %res
@@ -1022,7 +2447,42 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
   ret <8 x i8> %res
@@ -1039,7 +2499,74 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
   ret <16 x i8> %res
@@ -1057,10 +2584,140 @@ define void @abs_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    abs v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
@@ -1080,9 +2737,17 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w9, mi
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false)
   ret <2 x i16> %res
@@ -1099,7 +2764,26 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
   ret <4 x i16> %res
@@ -1116,7 +2800,42 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
   ret <8 x i16> %res
@@ -1134,10 +2853,76 @@ define void @abs_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    abs v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
@@ -1156,7 +2941,17 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   ret <2 x i32> %res
@@ -1173,7 +2968,24 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   ret <4 x i32> %res
@@ -1191,10 +3003,40 @@ define void @abs_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
@@ -1213,7 +3055,14 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs d0, d0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
   ret <1 x i64> %res
@@ -1230,7 +3079,17 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   ret <2 x i64> %res
@@ -1248,10 +3107,26 @@ define void @abs_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index 3148d4f1677cd5..159cbc1a5e46de 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -22,7 +22,51 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i8> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i8>
@@ -42,7 +86,90 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <16 x i8> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i8>
@@ -64,11 +191,175 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -91,7 +382,31 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i16> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
@@ -111,7 +426,50 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i16> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
@@ -133,11 +491,95 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmeq v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -160,7 +602,19 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i32> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
@@ -180,7 +634,26 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i32> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -202,11 +675,47 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    cmeq v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -229,7 +738,15 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <1 x i64> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
@@ -249,7 +766,18 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i64> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
@@ -271,11 +799,31 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmeq v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -304,13 +852,175 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ne_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -337,10 +1047,53 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sge_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmge v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
@@ -369,11 +1122,95 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sgt_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmgt v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -400,10 +1237,29 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sle_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
@@ -432,11 +1288,47 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_slt_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    cmgt v1.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -463,10 +1355,21 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_uge_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhs v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, hs
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, hs
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
@@ -493,10 +1396,21 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ugt_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, hi
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
@@ -523,10 +1437,21 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ule_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhs v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, ls
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, ls
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
@@ -553,10 +1478,21 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ult_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, lo
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 27a4924ea367cb..6d804197fedcad 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -28,27 +28,27 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
 ; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -80,41 +80,43 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[6]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -166,71 +168,74 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w16, v0.b[7]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[9]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[11]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[13]
-; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    smov w16, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
-; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[9]
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
-; NONEON-NOSVE-NEXT:    sdiv w8, w18, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[14]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w8, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w9, w12, w9
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -315,159 +320,143 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[10]
-; NONEON-NOSVE-NEXT:    smov w3, v0.b[11]
-; NONEON-NOSVE-NEXT:    smov w4, v0.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[13]
-; NONEON-NOSVE-NEXT:    smov w6, v0.b[14]
-; NONEON-NOSVE-NEXT:    smov w1, v3.b[1]
-; NONEON-NOSVE-NEXT:    smov w7, v2.b[0]
-; NONEON-NOSVE-NEXT:    smov w19, v2.b[2]
-; NONEON-NOSVE-NEXT:    smov w20, v2.b[3]
-; NONEON-NOSVE-NEXT:    smov w21, v2.b[4]
-; NONEON-NOSVE-NEXT:    smov w22, v2.b[5]
-; NONEON-NOSVE-NEXT:    smov w23, v2.b[6]
-; NONEON-NOSVE-NEXT:    smov w24, v2.b[7]
-; NONEON-NOSVE-NEXT:    smov w25, v2.b[8]
-; NONEON-NOSVE-NEXT:    smov w26, v2.b[9]
-; NONEON-NOSVE-NEXT:    smov w27, v2.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w11, w11, w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w9
-; NONEON-NOSVE-NEXT:    smov w9, v3.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w12, w10
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    sdiv w13, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
-; NONEON-NOSVE-NEXT:    smov w10, v3.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w12, w14, w12
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
-; NONEON-NOSVE-NEXT:    smov w13, v2.b[14]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[7]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
-; NONEON-NOSVE-NEXT:    smov w12, v2.b[13]
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w15
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[9]
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
-; NONEON-NOSVE-NEXT:    sdiv w17, w17, w15
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
-; NONEON-NOSVE-NEXT:    sdiv w15, w2, w15
-; NONEON-NOSVE-NEXT:    smov w2, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
-; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    smov w3, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
-; NONEON-NOSVE-NEXT:    sdiv w3, w4, w3
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
-; NONEON-NOSVE-NEXT:    sdiv w4, w5, w4
-; NONEON-NOSVE-NEXT:    smov w5, v1.b[14]
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
-; NONEON-NOSVE-NEXT:    sdiv w5, w6, w5
-; NONEON-NOSVE-NEXT:    smov w6, v2.b[1]
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
-; NONEON-NOSVE-NEXT:    sdiv w1, w6, w1
-; NONEON-NOSVE-NEXT:    smov w6, v3.b[0]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
-; NONEON-NOSVE-NEXT:    sdiv w6, w7, w6
-; NONEON-NOSVE-NEXT:    smov w7, v3.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w7, w19, w7
-; NONEON-NOSVE-NEXT:    smov w19, v3.b[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w6
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    sdiv w19, w20, w19
-; NONEON-NOSVE-NEXT:    smov w20, v3.b[4]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
-; NONEON-NOSVE-NEXT:    sdiv w20, w21, w20
-; NONEON-NOSVE-NEXT:    smov w21, v3.b[5]
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
-; NONEON-NOSVE-NEXT:    sdiv w21, w22, w21
-; NONEON-NOSVE-NEXT:    smov w22, v3.b[6]
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w22, w23, w22
-; NONEON-NOSVE-NEXT:    smov w23, v3.b[7]
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
-; NONEON-NOSVE-NEXT:    sdiv w23, w24, w23
-; NONEON-NOSVE-NEXT:    smov w24, v3.b[8]
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w24, w25, w24
-; NONEON-NOSVE-NEXT:    smov w25, v3.b[9]
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
-; NONEON-NOSVE-NEXT:    sdiv w25, w26, w25
-; NONEON-NOSVE-NEXT:    smov w26, v3.b[10]
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w8, w27, w26
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[12]
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
-; NONEON-NOSVE-NEXT:    smov w8, v3.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v3.b[13]
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v3.b[14]
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v2.b[15]
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
-; NONEON-NOSVE-NEXT:    sdiv w8, w13, w8
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -490,19 +479,18 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -523,25 +511,27 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -572,39 +562,42 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -649,75 +642,79 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[6]
-; NONEON-NOSVE-NEXT:    smov w16, v2.h[1]
-; NONEON-NOSVE-NEXT:    smov w17, v2.h[0]
-; NONEON-NOSVE-NEXT:    smov w18, v2.h[2]
-; NONEON-NOSVE-NEXT:    smov w1, v2.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w2, v2.h[4]
-; NONEON-NOSVE-NEXT:    smov w3, v2.h[5]
-; NONEON-NOSVE-NEXT:    smov w4, v2.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w10, w10, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    smov w10, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
-; NONEON-NOSVE-NEXT:    smov w9, v2.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
-; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    smov w16, v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
-; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    smov w17, v3.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w17, w18, w17
-; NONEON-NOSVE-NEXT:    smov w18, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w16
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
-; NONEON-NOSVE-NEXT:    sdiv w18, w1, w18
-; NONEON-NOSVE-NEXT:    smov w1, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
-; NONEON-NOSVE-NEXT:    sdiv w1, w2, w1
-; NONEON-NOSVE-NEXT:    smov w2, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
-; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    smov w3, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
-; NONEON-NOSVE-NEXT:    sdiv w8, w4, w3
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
-; NONEON-NOSVE-NEXT:    sdiv w9, w9, w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -738,17 +735,17 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -766,22 +763,22 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
 ; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -801,41 +798,39 @@ define void @sdiv_v8i32(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
-; NONEON-NOSVE-NEXT:    fmov w13, s2
-; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    fmov w12, s3
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
-; NONEON-NOSVE-NEXT:    sdiv w8, w16, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -856,12 +851,14 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -879,14 +876,16 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -906,25 +905,27 @@ define void @sdiv_v4i64(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x11, d2
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
 ; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov x10, d3
-; NONEON-NOSVE-NEXT:    sdiv x10, x11, x10
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sdiv x11, x12, x11
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -954,33 +955,27 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
-; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
 ; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
 ; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    and w9, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w11, w12, #0xff
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -1012,41 +1007,43 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[6]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -1098,71 +1095,74 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w16, v0.b[7]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[9]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[10]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[11]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[12]
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[13]
-; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    umov w16, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
-; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[9]
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
-; NONEON-NOSVE-NEXT:    udiv w8, w18, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[14]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[15]
-; NONEON-NOSVE-NEXT:    udiv w8, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[15]
-; NONEON-NOSVE-NEXT:    udiv w9, w12, w9
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -1247,159 +1247,143 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[10]
-; NONEON-NOSVE-NEXT:    umov w3, v0.b[11]
-; NONEON-NOSVE-NEXT:    umov w4, v0.b[12]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[13]
-; NONEON-NOSVE-NEXT:    umov w6, v0.b[14]
-; NONEON-NOSVE-NEXT:    umov w1, v3.b[1]
-; NONEON-NOSVE-NEXT:    umov w7, v2.b[0]
-; NONEON-NOSVE-NEXT:    umov w19, v2.b[2]
-; NONEON-NOSVE-NEXT:    umov w20, v2.b[3]
-; NONEON-NOSVE-NEXT:    umov w21, v2.b[4]
-; NONEON-NOSVE-NEXT:    umov w22, v2.b[5]
-; NONEON-NOSVE-NEXT:    umov w23, v2.b[6]
-; NONEON-NOSVE-NEXT:    umov w24, v2.b[7]
-; NONEON-NOSVE-NEXT:    umov w25, v2.b[8]
-; NONEON-NOSVE-NEXT:    umov w26, v2.b[9]
-; NONEON-NOSVE-NEXT:    umov w27, v2.b[10]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w11, w11, w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w9
-; NONEON-NOSVE-NEXT:    umov w9, v3.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w12, w10
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    udiv w13, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
-; NONEON-NOSVE-NEXT:    umov w10, v3.b[12]
-; NONEON-NOSVE-NEXT:    udiv w12, w14, w12
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
-; NONEON-NOSVE-NEXT:    umov w13, v2.b[14]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[7]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
-; NONEON-NOSVE-NEXT:    umov w12, v2.b[13]
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w15
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[9]
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
-; NONEON-NOSVE-NEXT:    udiv w17, w17, w15
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
-; NONEON-NOSVE-NEXT:    udiv w15, w2, w15
-; NONEON-NOSVE-NEXT:    umov w2, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
-; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    umov w3, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
-; NONEON-NOSVE-NEXT:    udiv w3, w4, w3
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
-; NONEON-NOSVE-NEXT:    udiv w4, w5, w4
-; NONEON-NOSVE-NEXT:    umov w5, v1.b[14]
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
-; NONEON-NOSVE-NEXT:    udiv w5, w6, w5
-; NONEON-NOSVE-NEXT:    umov w6, v2.b[1]
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
-; NONEON-NOSVE-NEXT:    udiv w1, w6, w1
-; NONEON-NOSVE-NEXT:    umov w6, v3.b[0]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
-; NONEON-NOSVE-NEXT:    udiv w6, w7, w6
-; NONEON-NOSVE-NEXT:    umov w7, v3.b[2]
-; NONEON-NOSVE-NEXT:    udiv w7, w19, w7
-; NONEON-NOSVE-NEXT:    umov w19, v3.b[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w6
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    udiv w19, w20, w19
-; NONEON-NOSVE-NEXT:    umov w20, v3.b[4]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
-; NONEON-NOSVE-NEXT:    udiv w20, w21, w20
-; NONEON-NOSVE-NEXT:    umov w21, v3.b[5]
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
-; NONEON-NOSVE-NEXT:    udiv w21, w22, w21
-; NONEON-NOSVE-NEXT:    umov w22, v3.b[6]
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w22, w23, w22
-; NONEON-NOSVE-NEXT:    umov w23, v3.b[7]
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
-; NONEON-NOSVE-NEXT:    udiv w23, w24, w23
-; NONEON-NOSVE-NEXT:    umov w24, v3.b[8]
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w24, w25, w24
-; NONEON-NOSVE-NEXT:    umov w25, v3.b[9]
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
-; NONEON-NOSVE-NEXT:    udiv w25, w26, w25
-; NONEON-NOSVE-NEXT:    umov w26, v3.b[10]
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w8, w27, w26
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[12]
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
-; NONEON-NOSVE-NEXT:    umov w8, v3.b[15]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v3.b[13]
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[15]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v3.b[14]
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[15]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v2.b[15]
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
-; NONEON-NOSVE-NEXT:    udiv w8, w13, w8
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -1422,18 +1406,18 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -1454,25 +1438,27 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -1503,39 +1489,42 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[6]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -1580,75 +1569,79 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[6]
-; NONEON-NOSVE-NEXT:    umov w16, v2.h[1]
-; NONEON-NOSVE-NEXT:    umov w17, v2.h[0]
-; NONEON-NOSVE-NEXT:    umov w18, v2.h[2]
-; NONEON-NOSVE-NEXT:    umov w1, v2.h[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w2, v2.h[4]
-; NONEON-NOSVE-NEXT:    umov w3, v2.h[5]
-; NONEON-NOSVE-NEXT:    umov w4, v2.h[6]
-; NONEON-NOSVE-NEXT:    udiv w10, w10, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    umov w10, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
-; NONEON-NOSVE-NEXT:    umov w9, v2.h[7]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
-; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    umov w16, v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
-; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    umov w17, v3.h[2]
-; NONEON-NOSVE-NEXT:    udiv w17, w18, w17
-; NONEON-NOSVE-NEXT:    umov w18, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w16
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
-; NONEON-NOSVE-NEXT:    udiv w18, w1, w18
-; NONEON-NOSVE-NEXT:    umov w1, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
-; NONEON-NOSVE-NEXT:    udiv w1, w2, w1
-; NONEON-NOSVE-NEXT:    umov w2, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
-; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    umov w3, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
-; NONEON-NOSVE-NEXT:    udiv w8, w4, w3
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
-; NONEON-NOSVE-NEXT:    udiv w9, w9, w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -1669,17 +1662,17 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -1697,22 +1690,22 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
 ; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -1732,41 +1725,39 @@ define void @udiv_v8i32(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
-; NONEON-NOSVE-NEXT:    fmov w13, s2
-; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    fmov w12, s3
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
-; NONEON-NOSVE-NEXT:    udiv w8, w16, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -1787,12 +1778,14 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -1810,14 +1803,16 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -1837,25 +1832,27 @@ define void @udiv_v4i64(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x11, d2
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
-; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov x10, d3
-; NONEON-NOSVE-NEXT:    udiv x10, x11, x10
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    udiv x11, x12, x11
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -1905,23 +1902,66 @@ define void @udiv_constantsplat_v8i32(ptr %a)  {
 ;
 ; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #8969 // =0x2309
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
 ; NONEON-NOSVE-NEXT:    movk w8, #22765, lsl #16
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umull2 v3.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v4.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    umull2 v5.2d, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v2.2s, v0.2s
-; NONEON-NOSVE-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v5.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    sub v2.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    usra v3.4s, v1.4s, #1
-; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #1
-; NONEON-NOSVE-NEXT:    ushr v1.4s, v3.4s, #6
-; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #6
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    umull x8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w8
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #6
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index e320fed2a498de..c8e2aea821e525 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -30,18 +30,50 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i1_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    sbfx w8, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w15, #0, #1
+; NONEON-NOSVE-NEXT:    stp w8, w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    sbfx w12, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    stp w12, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i1> %a to <8 x i32>
   store <8 x i32> %b, ptr %out
@@ -73,17 +105,21 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) {
 ; NONEON-NOSVE-LABEL: sext_v4i3_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #61
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #61
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #61
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #61
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #3
+; NONEON-NOSVE-NEXT:    sbfx x9, x9, #0, #3
+; NONEON-NOSVE-NEXT:    sbfx x10, x10, #0, #3
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sbfx x8, x11, #0, #3
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i3> %a to <4 x i64>
   store <4 x i64> %b, ptr %out
@@ -106,13 +142,45 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
@@ -138,20 +206,206 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -177,14 +431,42 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i8_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -210,21 +492,75 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
@@ -263,36 +599,280 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #464
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 464
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #464
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -325,17 +905,19 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; NONEON-NOSVE-LABEL: sext_v4i8_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #56
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #56
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #56
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #56
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb x8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb x11, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp x10, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -362,22 +944,57 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i8_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 176
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x8, sp, #144
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -419,37 +1036,109 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #112
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
@@ -522,69 +1211,367 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #224
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    sshll v5.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v6.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v4.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
-; NONEON-NOSVE-NEXT:    sshll v6.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v7.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
-; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v19.2d, v19.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
-; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
-; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v16.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v22.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v23.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v20.2s, #0
-; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v21.2s, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v18.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
-; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #752
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 848
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #372]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #352]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #412]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #448]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #584]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #408]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #600]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #416]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #428]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #424]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #592]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #380]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #376]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [sp, #496]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #384]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #396]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #392]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #528]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldp q16, q17, [sp, #688]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #480]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldp q19, q20, [sp, #720]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #432]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #444]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #648]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #440]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #452]
+; NONEON-NOSVE-NEXT:    ldp q22, q23, [sp, #624]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #664]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldp q21, q18, [sp, #656]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q16, q17, [x1, #128]
+; NONEON-NOSVE-NEXT:    stp q19, q20, [x1, #160]
+; NONEON-NOSVE-NEXT:    stp q22, q23, [x1, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q18, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #752
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -609,13 +1596,25 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -640,20 +1639,91 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -679,14 +1749,24 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -712,21 +1792,39 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -765,36 +1863,124 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -819,13 +2005,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -850,20 +2040,43 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
@@ -888,13 +2101,45 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
@@ -920,20 +2165,206 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -959,14 +2390,42 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i8_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -992,21 +2451,75 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
@@ -1045,36 +2558,280 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #464
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 464
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #464
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -1104,16 +2861,26 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i8_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -1140,22 +2907,61 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i8_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 176
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x8, sp, #144
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -1197,37 +3003,129 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #112
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #332]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #324]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #348]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #340]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp wzr, w9, [sp, #244]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
@@ -1300,69 +3198,400 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #224
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    ushll v5.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v6.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v4.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
-; NONEON-NOSVE-NEXT:    ushll v6.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v7.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
-; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v19.2d, v19.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
-; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
-; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v16.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v22.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v23.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v20.2s, #0
-; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v21.2s, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v18.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
-; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #752
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 848
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #572]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #564]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #588]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #580]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #604]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #620]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #612]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #508]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #500]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #524]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #516]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #540]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #556]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #548]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #700]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #692]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #716]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #708]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #732]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #724]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #748]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #740]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #636]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #628]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #652]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #644]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #668]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #660]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #684]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #288]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #676]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #352]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #584]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #408]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #600]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #380]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #376]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [sp, #496]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #392]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldp q16, q17, [sp, #688]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldp q19, q20, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #648]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    ldp q22, q23, [sp, #624]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #664]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldp q21, q18, [sp, #656]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q16, q17, [x1, #128]
+; NONEON-NOSVE-NEXT:    stp q19, q20, [x1, #160]
+; NONEON-NOSVE-NEXT:    stp q22, q23, [x1, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q18, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #752
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -1387,13 +3616,25 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -1418,20 +3659,91 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -1457,14 +3769,26 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -1490,21 +3814,43 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -1543,36 +3889,144 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #324]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #348]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #340]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    stp wzr, w9, [sp, #244]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -1597,13 +4051,19 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -1628,20 +4088,47 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
@@ -1672,17 +4159,17 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ;
 ; NONEON-NOSVE-LABEL: extend_and_mul:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v1.2s, w0
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov w9, w0
+; NONEON-NOSVE-NEXT:    mul x10, x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
@@ -1702,9 +4189,12 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ;
 ; NONEON-NOSVE-LABEL: extend_no_mul:
 ; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    mov w8, w0
+; NONEON-NOSVE-NEXT:    stp x8, x8, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 entry:
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index d86cfcbfb4f6e5..ce9250f9eaea1e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -26,11 +26,108 @@ define void @add_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
@@ -51,12 +148,60 @@ define void @add_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -77,12 +222,32 @@ define void @add_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -103,12 +268,22 @@ define void @add_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -133,11 +308,108 @@ define void @and_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
@@ -158,12 +430,60 @@ define void @and_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -184,12 +504,32 @@ define void @and_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -210,12 +550,22 @@ define void @and_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    and x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -240,10 +590,108 @@ define void @ashr_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
@@ -264,10 +712,60 @@ define void @ashr_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.8h, v1.8h, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -288,10 +786,32 @@ define void @ashr_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -312,10 +832,22 @@ define void @ashr_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.2d, v1.2d, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -343,11 +875,140 @@ define void @icmp_eq_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -372,12 +1033,76 @@ define void @icmp_sge_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sge_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    cmge v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmge v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -402,12 +1127,40 @@ define void @icmp_sgt_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sgt_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #-8 // =0xfffffff8
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    cmgt v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    cmgt v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 -8, i64 0
@@ -432,12 +1185,26 @@ define void @icmp_ult_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ult_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x9, lo
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    csetm x9, lo
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -463,10 +1230,108 @@ define void @lshr_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    ushr v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -487,10 +1352,60 @@ define void @lshr_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.8h, v0.8h, #15
-; NONEON-NOSVE-NEXT:    ushr v1.8h, v1.8h, #15
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -511,10 +1426,32 @@ define void @lshr_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    ushr v1.4s, v1.4s, #31
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -535,10 +1472,22 @@ define void @lshr_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.2d, v0.2d, #63
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v1.2d, #63
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -563,11 +1512,140 @@ define void @mul_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mul v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -588,12 +1666,76 @@ define void @mul_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    mul v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    mul v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -614,12 +1756,44 @@ define void @mul_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    mul v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mul v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -640,24 +1814,28 @@ define void @mul_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    lsl x12, x10, #6
-; NONEON-NOSVE-NEXT:    lsl x13, x11, #6
-; NONEON-NOSVE-NEXT:    lsl x14, x8, #6
-; NONEON-NOSVE-NEXT:    sub x10, x12, x10
-; NONEON-NOSVE-NEXT:    sub x11, x13, x11
-; NONEON-NOSVE-NEXT:    lsl x12, x9, #6
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x11
-; NONEON-NOSVE-NEXT:    sub x8, x14, x8
-; NONEON-NOSVE-NEXT:    sub x9, x12, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x9
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -682,11 +1860,108 @@ define void @or_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -707,12 +1982,60 @@ define void @or_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -733,12 +2056,32 @@ define void @or_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -759,12 +2102,22 @@ define void @or_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    orr x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -789,10 +2142,108 @@ define void @shl_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -813,10 +2264,60 @@ define void @shl_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.8h, v0.8h, #15
-; NONEON-NOSVE-NEXT:    shl v1.8h, v1.8h, #15
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -837,10 +2338,32 @@ define void @shl_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -861,10 +2384,22 @@ define void @shl_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #63
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #63
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsl x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -889,11 +2424,141 @@ define void @smax_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smax v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -914,12 +2579,77 @@ define void @smax_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    smax v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smax v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -940,12 +2670,41 @@ define void @smax_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    smax v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smax v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -966,14 +2725,27 @@ define void @smax_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmgt v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -998,11 +2770,141 @@ define void @smin_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smin v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1023,12 +2925,77 @@ define void @smin_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    smin v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smin v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1049,12 +3016,41 @@ define void @smin_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    smin v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smin v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1075,14 +3071,27 @@ define void @smin_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmgt v3.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1107,11 +3116,108 @@ define void @sub_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1132,12 +3238,60 @@ define void @sub_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    sub v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1158,12 +3312,32 @@ define void @sub_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1184,12 +3358,22 @@ define void @sub_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    sub v1.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    sub x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1214,11 +3398,141 @@ define void @umax_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umax v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1239,12 +3553,77 @@ define void @umax_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    umax v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umax v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1265,12 +3644,41 @@ define void @umax_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umax v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umax v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1291,14 +3699,27 @@ define void @umax_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1323,11 +3744,141 @@ define void @umin_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umin v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1348,12 +3899,77 @@ define void @umin_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    umin v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umin v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1374,12 +3990,41 @@ define void @umin_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umin v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umin v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1400,14 +4045,27 @@ define void @umin_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v3.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1432,11 +4090,108 @@ define void @xor_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1457,12 +4212,60 @@ define void @xor_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1483,12 +4286,32 @@ define void @xor_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1509,12 +4332,22 @@ define void @xor_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    eor x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index f0b39b275614d4..404d3d3bb86a65 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -20,7 +20,43 @@ define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -37,7 +73,74 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -55,11 +158,143 @@ define void @and_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -79,7 +314,27 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -96,7 +351,42 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -114,11 +404,79 @@ define void @and_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -138,7 +496,18 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -155,7 +524,24 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -173,11 +559,43 @@ define void @and_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -197,7 +615,14 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -214,7 +639,17 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -232,11 +667,29 @@ define void @and_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -260,7 +713,43 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -277,7 +766,74 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -295,11 +851,143 @@ define void @or_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -319,7 +1007,27 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -336,7 +1044,42 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -354,11 +1097,79 @@ define void @or_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -378,7 +1189,18 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -395,7 +1217,24 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -413,11 +1252,43 @@ define void @or_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -437,7 +1308,14 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -454,7 +1332,17 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -472,11 +1360,29 @@ define void @or_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -500,7 +1406,43 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -517,7 +1459,74 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -535,11 +1544,143 @@ define void @xor_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -559,7 +1700,27 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -576,7 +1737,42 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -594,11 +1790,79 @@ define void @xor_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -618,7 +1882,18 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -635,7 +1910,24 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -653,11 +1945,43 @@ define void @xor_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -677,7 +2001,14 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -694,7 +2025,17 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -712,11 +2053,29 @@ define void @xor_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 51c404ece6cd5e..44f7954de0a274 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -21,7 +21,51 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -39,7 +83,90 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -59,11 +186,175 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smax v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -84,7 +375,31 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -102,7 +417,50 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -122,11 +480,95 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smax v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -147,7 +589,19 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -165,7 +619,26 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -185,11 +658,47 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -211,8 +720,15 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -231,8 +747,18 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -252,14 +778,31 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmgt v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -284,7 +827,51 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -302,7 +889,90 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -322,11 +992,175 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smin v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -347,7 +1181,31 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -365,7 +1223,50 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -385,11 +1286,95 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smin v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -410,7 +1395,19 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -428,7 +1425,26 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -448,11 +1464,47 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -474,8 +1526,15 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -494,8 +1553,18 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -515,14 +1584,31 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmgt v5.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -547,7 +1633,51 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -565,7 +1695,90 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -585,11 +1798,175 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umax v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -610,7 +1987,31 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -628,7 +2029,50 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -648,11 +2092,95 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umax v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -673,7 +2201,19 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -691,7 +2231,26 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -711,11 +2270,47 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -737,8 +2332,15 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -757,8 +2359,18 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -778,14 +2390,31 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmhi v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -810,7 +2439,51 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -828,7 +2501,90 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -848,11 +2604,175 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umin v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -873,7 +2793,31 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -891,7 +2835,50 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -911,11 +2898,95 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umin v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -936,7 +3007,19 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -954,7 +3037,26 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -974,11 +3076,47 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -1000,8 +3138,15 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -1020,8 +3165,18 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -1041,14 +3196,31 @@ define void @umin_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v5.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
index 83714152c173f5..4f89d652e4deb5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
@@ -24,8 +24,51 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
 ;
 ; NONEON-NOSVE-LABEL: mla8xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mla v2.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #7]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #6]
+; NONEON-NOSVE-NEXT:    madd w1, w2, w1, w5
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #9]
+; NONEON-NOSVE-NEXT:    madd w1, w4, w3, w1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #21]
+; NONEON-NOSVE-NEXT:    madd w18, w0, w18, w1
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    madd w16, w17, w16, w18
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #19]
+; NONEON-NOSVE-NEXT:    madd w14, w15, w14, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    madd w12, w13, w12, w14
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #17]
+; NONEON-NOSVE-NEXT:    madd w10, w11, w10, w12
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    madd w8, w9, w8, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = mul <8 x i8> %A, %B;
   %tmp2 = add <8 x i8> %C, %tmp1;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 6e6d40e2ea040f..0060fde7dd3764 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -40,12 +40,31 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w13
+; NONEON-NOSVE-NEXT:    mul w10, w10, w14
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #4, #12
+; NONEON-NOSVE-NEXT:    ubfx w9, w9, #4, #12
+; NONEON-NOSVE-NEXT:    ubfx w10, w10, #4, #12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w11, #4, #12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 4, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -77,8 +96,51 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w16
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w18
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 8, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -110,9 +172,116 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.8h, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #60]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w20, w20, w21
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w19, w19, w23
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    mul w7, w7, w25
+; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    mul w6, w6, w26
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
+; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #88]
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #90]
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
+; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
+; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w1
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w5
+; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w24
+; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w0, w0, w23
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    mul w4, w4, w27
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    mul w3, w3, w25
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w26
+; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w17, w17, w21
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w22
+; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w14
+; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <16 x i8> %op1 to <16 x i16>
   %2 = sext <16 x i8> %op2 to <16 x i16>
@@ -145,15 +314,251 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.8h, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smull v0.8h, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    smull2 v1.8h, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    smull v2.8h, v2.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #384
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x29, x0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #185]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #187]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #189]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #229]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #227]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #191]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #177]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #215]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #179]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #213]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #181]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #247]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #244]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #183]
+; NONEON-NOSVE-NEXT:    mul w26, w12, w16
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #250]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #233]
+; NONEON-NOSVE-NEXT:    mul w30, w10, w12
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #255]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #253]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #235]
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #249]
+; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #210]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #237]
+; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #211]
+; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #209]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #239]
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #220]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #225]
+; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #221]
+; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #219]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #231]
+; NONEON-NOSVE-NEXT:    mul w27, w8, w14
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #217]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w15
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #251]
+; NONEON-NOSVE-NEXT:    mul w25, w13, w14
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #243]
+; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
+; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #218]
+; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
+; NONEON-NOSVE-NEXT:    mul w28, w11, w13
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
+; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
+; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #254]
+; NONEON-NOSVE-NEXT:    mul w8, w25, w8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #252]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w9, w25, w9
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
+; NONEON-NOSVE-NEXT:    mul w12, w14, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
+; NONEON-NOSVE-NEXT:    mul w10, w25, w10
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    mul w11, w25, w11
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
+; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
+; NONEON-NOSVE-NEXT:    mul w13, w14, w13
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w15
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w16
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w0
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w2
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w3
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w13, w13, w4
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w5
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w6
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    mul w13, w13, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w20
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w21
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w27
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w15
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #384
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -193,12 +598,20 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w10
+; NONEON-NOSVE-NEXT:    mul w9, w9, w11
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i16> %op1 to <2 x i32>
   %2 = sext <2 x i16> %op2 to <2 x i32>
@@ -228,8 +641,31 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w13
+; NONEON-NOSVE-NEXT:    mul w9, w9, w14
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
@@ -259,9 +695,54 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.4s, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w0, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w18
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w16
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <8 x i16> %op1 to <8 x i32>
   %2 = sext <8 x i16> %op2 to <8 x i32>
@@ -294,15 +775,125 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.4s, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smull v0.4s, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    smull2 v1.4s, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    smull v2.4s, v2.4h, v3.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #240
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w7, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w19, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrsh w20, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldrsh w21, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w2, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldrsh w4, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldrsh w5, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w3, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w15
+; NONEON-NOSVE-NEXT:    ldrsh w6, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w23, [sp, #82]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w3
+; NONEON-NOSVE-NEXT:    ldrsh w25, [sp, #84]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w25
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    mul w12, w12, w6
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w28, [sp, #126]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w17
+; NONEON-NOSVE-NEXT:    mul w21, w21, w22
+; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #86]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w20, w20, w24
+; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #120]
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    mul w19, w19, w26
+; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #124]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    mul w7, w7, w27
+; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #122]
+; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
+; NONEON-NOSVE-NEXT:    mul w5, w5, w28
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    mul w4, w4, w26
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w27
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
+; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
+; NONEON-NOSVE-NEXT:    mul w18, w18, w24
+; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
+; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
+; NONEON-NOSVE-NEXT:    mul w16, w16, w22
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
+; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
+; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
+; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
+; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #240
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -335,8 +926,18 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldpsw x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    smull x9, w9, w10
+; NONEON-NOSVE-NEXT:    smull x8, w8, w11
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
@@ -366,9 +967,28 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.2d, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x13, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    smull x11, w11, w12
+; NONEON-NOSVE-NEXT:    ldpsw x12, x14, [sp, #56]
+; NONEON-NOSVE-NEXT:    smull x10, w10, w13
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    smull x9, w9, w14
+; NONEON-NOSVE-NEXT:    smull x8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i32> %op1 to <4 x i64>
   %2 = sext <4 x i32> %op2 to <4 x i64>
@@ -401,15 +1021,52 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smull v0.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    smull2 v1.2d, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    smull v2.2d, v2.2s, v3.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x12, x13, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldpsw x14, x15, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x17, x16, [sp, #112]
+; NONEON-NOSVE-NEXT:    smull x15, w15, w16
+; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #120]
+; NONEON-NOSVE-NEXT:    smull x14, w14, w17
+; NONEON-NOSVE-NEXT:    ldpsw x17, x1, [sp, #80]
+; NONEON-NOSVE-NEXT:    smull x13, w13, w18
+; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
+; NONEON-NOSVE-NEXT:    smull x12, w12, w16
+; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
+; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #88]
+; NONEON-NOSVE-NEXT:    smull x11, w11, w1
+; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
+; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
+; NONEON-NOSVE-NEXT:    smull x10, w10, w17
+; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
+; NONEON-NOSVE-NEXT:    smull x9, w9, w18
+; NONEON-NOSVE-NEXT:    smull x8, w8, w16
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -442,12 +1099,14 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d0
 ; NONEON-NOSVE-NEXT:    fmov x9, d1
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i128> undef, i128 64, i128 0
   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
@@ -479,15 +1138,17 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x10
+; NONEON-NOSVE-NEXT:    smulh x9, x9, x11
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
@@ -520,27 +1181,29 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x12, d3
-; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov x9, d2
-; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    smulh x9, x9, x12
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    smulh x11, x13, x14
-; NONEON-NOSVE-NEXT:    fmov d1, x10
-; NONEON-NOSVE-NEXT:    fmov d2, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    fmov d3, x11
-; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    smulh x10, x10, x12
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    smulh x11, x11, x13
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x12
+; NONEON-NOSVE-NEXT:    smulh x9, x9, x14
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -583,11 +1246,31 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w13
+; NONEON-NOSVE-NEXT:    mul w10, w10, w14
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #4
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #4
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w11, #4
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i8> %op1 to <4 x i16>
   %2 = zext <4 x i8> %op2 to <4 x i16>
@@ -617,8 +1300,51 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w16
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w18
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
@@ -648,9 +1374,116 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.8h, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #60]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w20, w20, w21
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w19, w19, w23
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    mul w7, w7, w25
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    mul w6, w6, w26
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #88]
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #90]
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w1
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w5
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w24
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w0, w0, w23
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    mul w4, w4, w27
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    mul w3, w3, w25
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w26
+; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w17, w17, w21
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w22
+; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w14
+; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <16 x i8> %op1 to <16 x i16>
   %2 = zext <16 x i8> %op2 to <16 x i16>
@@ -683,15 +1516,251 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.8h, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umull v0.8h, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    umull2 v1.8h, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    umull v2.8h, v2.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #384
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x29, x0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #185]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #187]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #189]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #229]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #227]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #191]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #177]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #215]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #179]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #213]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #181]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #247]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #244]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #183]
+; NONEON-NOSVE-NEXT:    mul w26, w12, w16
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #250]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #233]
+; NONEON-NOSVE-NEXT:    mul w30, w10, w12
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #255]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #253]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #235]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #249]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #210]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #237]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #211]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #209]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #239]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #220]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #225]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #221]
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #219]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #231]
+; NONEON-NOSVE-NEXT:    mul w27, w8, w14
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #217]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #251]
+; NONEON-NOSVE-NEXT:    mul w25, w13, w14
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #243]
+; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #218]
+; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
+; NONEON-NOSVE-NEXT:    mul w28, w11, w13
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
+; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
+; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #254]
+; NONEON-NOSVE-NEXT:    mul w8, w25, w8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #252]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w9, w25, w9
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
+; NONEON-NOSVE-NEXT:    mul w12, w14, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
+; NONEON-NOSVE-NEXT:    mul w10, w25, w10
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    mul w11, w25, w11
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
+; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
+; NONEON-NOSVE-NEXT:    mul w13, w14, w13
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w15
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w16
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w0
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w2
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w3
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w13, w13, w4
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w5
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w6
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    mul w13, w13, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w20
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w21
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w27
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w15
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #384
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -730,11 +1799,20 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w10
+; NONEON-NOSVE-NEXT:    mul w9, w9, w11
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i16> %op1 to <2 x i32>
   %2 = zext <2 x i16> %op2 to <2 x i32>
@@ -764,8 +1842,31 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w13
+; NONEON-NOSVE-NEXT:    mul w9, w9, w14
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
@@ -795,9 +1896,54 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.4s, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w18
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w16
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i16> %op1 to <8 x i32>
   %2 = zext <8 x i16> %op2 to <8 x i32>
@@ -830,15 +1976,125 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.4s, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umull v0.4s, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    umull2 v1.4s, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    umull v2.4s, v2.4h, v3.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #240
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w15
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #82]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w3
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #84]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w25
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    mul w12, w12, w6
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #126]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w17
+; NONEON-NOSVE-NEXT:    mul w21, w21, w22
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #86]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w20, w20, w24
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #120]
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    mul w19, w19, w26
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #124]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    mul w7, w7, w27
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #122]
+; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
+; NONEON-NOSVE-NEXT:    mul w5, w5, w28
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    mul w4, w4, w26
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w27
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
+; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
+; NONEON-NOSVE-NEXT:    mul w18, w18, w24
+; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
+; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
+; NONEON-NOSVE-NEXT:    mul w16, w16, w22
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
+; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
+; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
+; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
+; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #240
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -871,8 +2127,18 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    umull x9, w9, w10
+; NONEON-NOSVE-NEXT:    umull x8, w8, w11
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
@@ -902,9 +2168,28 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w13, w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    umull x11, w11, w12
+; NONEON-NOSVE-NEXT:    ldp w12, w14, [sp, #56]
+; NONEON-NOSVE-NEXT:    umull x10, w10, w13
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    umull x9, w9, w14
+; NONEON-NOSVE-NEXT:    umull x8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i32> %op1 to <4 x i64>
   %2 = zext <4 x i32> %op2 to <4 x i64>
@@ -937,15 +2222,52 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    umull2 v1.2d, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    umull v2.2d, v2.2s, v3.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w17, w16, [sp, #112]
+; NONEON-NOSVE-NEXT:    umull x15, w15, w16
+; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #120]
+; NONEON-NOSVE-NEXT:    umull x14, w14, w17
+; NONEON-NOSVE-NEXT:    ldp w17, w1, [sp, #80]
+; NONEON-NOSVE-NEXT:    umull x13, w13, w18
+; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
+; NONEON-NOSVE-NEXT:    umull x12, w12, w16
+; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
+; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #88]
+; NONEON-NOSVE-NEXT:    umull x11, w11, w1
+; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
+; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
+; NONEON-NOSVE-NEXT:    umull x10, w10, w17
+; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
+; NONEON-NOSVE-NEXT:    umull x9, w9, w18
+; NONEON-NOSVE-NEXT:    umull x8, w8, w16
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -980,12 +2302,14 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d0
 ; NONEON-NOSVE-NEXT:    fmov x9, d1
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
@@ -1015,15 +2339,17 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x10
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x11
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
@@ -1056,27 +2382,29 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x12, d3
-; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov x9, d2
-; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    umulh x9, x9, x12
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    umulh x11, x13, x14
-; NONEON-NOSVE-NEXT:    fmov d1, x10
-; NONEON-NOSVE-NEXT:    fmov d2, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    fmov d3, x11
-; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    umulh x10, x10, x12
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    umulh x11, x11, x13
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x12
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x14
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 50eaa6c12d71e6..e7ab99ae37c1e7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -21,8 +21,25 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w14
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -40,8 +57,40 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w11, w14, w13
+; NONEON-NOSVE-NEXT:    add w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w12, w12, w14
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    add w9, w10, w9
+; NONEON-NOSVE-NEXT:    add w10, w12, w16
+; NONEON-NOSVE-NEXT:    add w8, w8, w15
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w13
+; NONEON-NOSVE-NEXT:    add w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -61,9 +110,72 @@ define i8 @uaddv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    addv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w9, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w11, w15, w14
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w14, w15, w14
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w9, w9, w14
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    add w11, w16, w11
+; NONEON-NOSVE-NEXT:    add w10, w10, w11
+; NONEON-NOSVE-NEXT:    add w11, w17, w13
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
@@ -82,8 +194,17 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -101,8 +222,24 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w14
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -122,9 +259,40 @@ define i16 @uaddv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w9, w11, w10
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w13, w15, w14
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w10, w14, w10
+; NONEON-NOSVE-NEXT:    add w11, w15, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w13, w12
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
@@ -143,8 +311,12 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -162,8 +334,13 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -183,9 +360,20 @@ define i32 @uaddv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    addv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w9, w11, w9
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w10, w14, w12
+; NONEON-NOSVE-NEXT:    add w11, w15, w13
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
@@ -203,8 +391,10 @@ define i64 @uaddv_v2i64(<2 x i64> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addp d0, v0.2d
-; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    add x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -223,9 +413,13 @@ define i64 @uaddv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    addp d0, v0.2d
-; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    add x9, x11, x9
+; NONEON-NOSVE-NEXT:    add x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
@@ -247,8 +441,32 @@ define i8 @smaxv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -265,8 +483,55 @@ define i8 @smaxv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -285,9 +550,103 @@ define i8 @smaxv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
@@ -305,8 +664,20 @@ define i16 @smaxv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -323,8 +694,31 @@ define i16 @smaxv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -343,9 +737,55 @@ define i16 @smaxv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
@@ -363,8 +803,13 @@ define i32 @smaxv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -381,8 +826,17 @@ define i32 @smaxv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -401,9 +855,27 @@ define i32 @smaxv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
@@ -424,11 +896,9 @@ define i64 @smaxv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, gt
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -447,15 +917,17 @@ define i64 @smaxv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, gt
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, gt
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, gt
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
@@ -477,8 +949,32 @@ define i8 @sminv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -495,8 +991,55 @@ define i8 @sminv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -515,9 +1058,103 @@ define i8 @sminv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: sminv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
@@ -535,8 +1172,20 @@ define i16 @sminv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -553,8 +1202,31 @@ define i16 @sminv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -573,9 +1245,55 @@ define i16 @sminv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: sminv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
@@ -593,8 +1311,13 @@ define i32 @sminv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -611,8 +1334,17 @@ define i32 @sminv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -631,9 +1363,27 @@ define i32 @sminv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: sminv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
@@ -654,11 +1404,9 @@ define i64 @sminv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lt
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -676,16 +1424,18 @@ define i64 @sminv_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, lt
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, lt
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lt
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
@@ -707,8 +1457,32 @@ define i8 @umaxv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -725,8 +1499,55 @@ define i8 @umaxv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -745,9 +1566,103 @@ define i8 @umaxv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
@@ -765,8 +1680,20 @@ define i16 @umaxv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -783,8 +1710,31 @@ define i16 @umaxv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -803,9 +1753,55 @@ define i16 @umaxv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
@@ -823,8 +1819,13 @@ define i32 @umaxv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -841,8 +1842,17 @@ define i32 @umaxv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -861,9 +1871,27 @@ define i32 @umaxv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
@@ -884,11 +1912,9 @@ define i64 @umaxv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, hi
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -907,15 +1933,17 @@ define i64 @umaxv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, hi
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, hi
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, hi
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
@@ -937,8 +1965,32 @@ define i8 @uminv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -955,8 +2007,55 @@ define i8 @uminv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -975,9 +2074,103 @@ define i8 @uminv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: uminv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
@@ -995,8 +2188,20 @@ define i16 @uminv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -1013,8 +2218,31 @@ define i16 @uminv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -1033,9 +2261,55 @@ define i16 @uminv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: uminv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
@@ -1053,8 +2327,13 @@ define i32 @uminv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -1071,8 +2350,17 @@ define i32 @uminv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -1091,9 +2379,27 @@ define i32 @uminv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: uminv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
@@ -1114,11 +2420,9 @@ define i64 @uminv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lo
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -1136,16 +2440,18 @@ define i64 @uminv_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, lo
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, lo
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lo
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 97bd76311b61c3..b1b7ceda9ebcce 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -28,31 +28,31 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #10]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w10, w16, w14, w15
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -86,49 +86,51 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[6]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -182,108 +184,90 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    smov w7, v1.b[6]
-; NONEON-NOSVE-NEXT:    smov w19, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w21, v1.b[7]
-; NONEON-NOSVE-NEXT:    smov w22, v0.b[7]
-; NONEON-NOSVE-NEXT:    smov w24, v1.b[8]
-; NONEON-NOSVE-NEXT:    smov w25, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w27, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w28, v0.b[9]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[11]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[11]
-; NONEON-NOSVE-NEXT:    smov w16, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[12]
-; NONEON-NOSVE-NEXT:    smov w0, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w6, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    smov w1, v0.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w20, w19, w7
-; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w23, w22, w21
-; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w26, w25, w24
-; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w28, w27
-; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    sdiv w15, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[14]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[15]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w1, w0
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -375,275 +359,175 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #320
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w4, v3.b[1]
-; NONEON-NOSVE-NEXT:    smov w1, v2.b[1]
-; NONEON-NOSVE-NEXT:    smov w7, v3.b[7]
-; NONEON-NOSVE-NEXT:    smov w5, v2.b[7]
-; NONEON-NOSVE-NEXT:    smov w6, v3.b[8]
-; NONEON-NOSVE-NEXT:    smov w3, v2.b[8]
-; NONEON-NOSVE-NEXT:    smov w22, v3.b[9]
-; NONEON-NOSVE-NEXT:    smov w20, v2.b[9]
-; NONEON-NOSVE-NEXT:    smov w13, v3.b[0]
-; NONEON-NOSVE-NEXT:    smov w17, v3.b[3]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[0]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w14, v2.b[3]
-; NONEON-NOSVE-NEXT:    smov w15, v3.b[4]
-; NONEON-NOSVE-NEXT:    smov w12, v2.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v3.b[5]
-; NONEON-NOSVE-NEXT:    smov w18, v2.b[5]
-; NONEON-NOSVE-NEXT:    smov w0, v3.b[6]
-; NONEON-NOSVE-NEXT:    smov w16, v2.b[6]
-; NONEON-NOSVE-NEXT:    smov w21, v3.b[10]
-; NONEON-NOSVE-NEXT:    smov w19, v2.b[10]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[2]
-; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[3]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[3]
-; NONEON-NOSVE-NEXT:    sdiv w26, w14, w17
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[4]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[5]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[6]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[6]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[7]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[7]
-; NONEON-NOSVE-NEXT:    sdiv w25, w12, w15
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[8]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[9]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[10]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[10]
-; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[11]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[11]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[12]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[12]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[13]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[13]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w11, v3.b[2]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[14]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[14]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v2.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w8, w1, w4
-; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w10, v2.b[0]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w5, w7
-; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w3, w6
-; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w20, w22
-; NONEON-NOSVE-NEXT:    sdiv w24, w10, w13
-; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w8
-; NONEON-NOSVE-NEXT:    sdiv w23, w9, w11
-; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w28, w18, w2
-; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    smov w10, v3.b[11]
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
-; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
-; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w27, w16, w0
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
-; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
-; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
-; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
-; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
-; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w4, w19, w21
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
-; NONEON-NOSVE-NEXT:    smov w12, v3.b[12]
-; NONEON-NOSVE-NEXT:    smov w14, v2.b[12]
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
-; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w13, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
-; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
-; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
-; NONEON-NOSVE-NEXT:    smov w16, v3.b[13]
-; NONEON-NOSVE-NEXT:    smov w17, v2.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
-; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
-; NONEON-NOSVE-NEXT:    sdiv w15, w14, w12
-; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
-; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
-; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
-; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
-; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
-; NONEON-NOSVE-NEXT:    smov w0, v3.b[14]
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
-; NONEON-NOSVE-NEXT:    smov w1, v2.b[14]
-; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
-; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
-; NONEON-NOSVE-NEXT:    sdiv w2, w1, w0
-; NONEON-NOSVE-NEXT:    smov w9, v3.b[15]
-; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
-; NONEON-NOSVE-NEXT:    smov w4, v2.b[15]
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w11, w4, w9
-; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[15]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w14, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
-; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #320
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -669,29 +553,31 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -724,47 +610,50 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.h[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.h[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -813,135 +702,95 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w20, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w21, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w19, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w5, v1.h[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w1, v3.h[1]
-; NONEON-NOSVE-NEXT:    smov w23, v2.h[1]
-; NONEON-NOSVE-NEXT:    smov w25, v3.h[0]
-; NONEON-NOSVE-NEXT:    smov w26, v2.h[0]
-; NONEON-NOSVE-NEXT:    smov w6, v1.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w3, v0.h[5]
-; NONEON-NOSVE-NEXT:    smov w4, v1.h[6]
-; NONEON-NOSVE-NEXT:    smov w7, v0.h[6]
-; NONEON-NOSVE-NEXT:    smov w28, v3.h[2]
-; NONEON-NOSVE-NEXT:    smov w29, v2.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v3.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v2.h[3]
-; NONEON-NOSVE-NEXT:    smov w12, v3.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v3.h[5]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w21, w20
-; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[3]
-; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w11, v2.h[4]
-; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
-; NONEON-NOSVE-NEXT:    sdiv w9, w19, w8
-; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w10, v3.h[6]
-; NONEON-NOSVE-NEXT:    fmov s5, w20
-; NONEON-NOSVE-NEXT:    smov w20, v3.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w2, w5
-; NONEON-NOSVE-NEXT:    sdiv w24, w23, w1
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w27, w26, w25
-; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
-; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w9, w3, w6
-; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w21
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
-; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
-; NONEON-NOSVE-NEXT:    sdiv w8, w7, w4
-; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
-; NONEON-NOSVE-NEXT:    smov w23, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w30, w29, w28
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v2.h[5]
-; NONEON-NOSVE-NEXT:    smov w8, v2.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w18, w13, w15
-; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
-; NONEON-NOSVE-NEXT:    sdiv w16, w11, w12
-; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w17, w9, w14
-; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
-; NONEON-NOSVE-NEXT:    sdiv w24, w8, w10
-; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
-; NONEON-NOSVE-NEXT:    sdiv w18, w23, w20
-; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w15, w13
-; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
-; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -964,19 +813,20 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -996,26 +846,28 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -1039,61 +891,50 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    fmov w3, s2
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w2, s3
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
-; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
-; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
-; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w4, w3, w2
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s1, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    sdiv w1, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
-; NONEON-NOSVE-NEXT:    sdiv w7, w6, w5
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w21, w20, w19
-; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    sdiv w9, w23, w22
-; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -1116,13 +957,15 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -1142,16 +985,19 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -1175,29 +1021,33 @@ define void @srem_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x15, d2
-; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x14, d3
-; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x11, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    sdiv x16, x15, x14
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    sdiv x1, x18, x17
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -1229,37 +1079,31 @@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w12, w12, #0xff
-; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #8]
 ; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
-; NONEON-NOSVE-NEXT:    and w14, w14, #0xff
-; NONEON-NOSVE-NEXT:    and w15, w15, #0xff
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    and w12, w17, #0xff
-; NONEON-NOSVE-NEXT:    and w13, w18, #0xff
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w12, w13
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w10, w16, w14, w15
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -1293,49 +1137,51 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[6]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -1389,108 +1235,90 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    umov w7, v1.b[6]
-; NONEON-NOSVE-NEXT:    umov w19, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w21, v1.b[7]
-; NONEON-NOSVE-NEXT:    umov w22, v0.b[7]
-; NONEON-NOSVE-NEXT:    umov w24, v1.b[8]
-; NONEON-NOSVE-NEXT:    umov w25, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w27, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w28, v0.b[9]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[11]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[10]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[11]
-; NONEON-NOSVE-NEXT:    umov w16, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[12]
-; NONEON-NOSVE-NEXT:    umov w0, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    udiv w6, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    umov w1, v0.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    udiv w20, w19, w7
-; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w23, w22, w21
-; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    udiv w26, w25, w24
-; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w28, w27
-; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    udiv w15, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[14]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[15]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w1, w0
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -1582,275 +1410,175 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #320
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w4, v3.b[1]
-; NONEON-NOSVE-NEXT:    umov w1, v2.b[1]
-; NONEON-NOSVE-NEXT:    umov w7, v3.b[7]
-; NONEON-NOSVE-NEXT:    umov w5, v2.b[7]
-; NONEON-NOSVE-NEXT:    umov w6, v3.b[8]
-; NONEON-NOSVE-NEXT:    umov w3, v2.b[8]
-; NONEON-NOSVE-NEXT:    umov w22, v3.b[9]
-; NONEON-NOSVE-NEXT:    umov w20, v2.b[9]
-; NONEON-NOSVE-NEXT:    umov w13, v3.b[0]
-; NONEON-NOSVE-NEXT:    umov w17, v3.b[3]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[0]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w14, v2.b[3]
-; NONEON-NOSVE-NEXT:    umov w15, v3.b[4]
-; NONEON-NOSVE-NEXT:    umov w12, v2.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v3.b[5]
-; NONEON-NOSVE-NEXT:    umov w18, v2.b[5]
-; NONEON-NOSVE-NEXT:    umov w0, v3.b[6]
-; NONEON-NOSVE-NEXT:    umov w16, v2.b[6]
-; NONEON-NOSVE-NEXT:    umov w21, v3.b[10]
-; NONEON-NOSVE-NEXT:    umov w19, v2.b[10]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[2]
-; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[3]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[3]
-; NONEON-NOSVE-NEXT:    udiv w26, w14, w17
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[4]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[5]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[6]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[6]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[7]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[7]
-; NONEON-NOSVE-NEXT:    udiv w25, w12, w15
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[8]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[9]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[10]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[10]
-; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[11]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[11]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[12]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[12]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[13]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[13]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w11, v3.b[2]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[14]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[14]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v2.b[2]
-; NONEON-NOSVE-NEXT:    udiv w8, w1, w4
-; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w10, v2.b[0]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w5, w7
-; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w3, w6
-; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w20, w22
-; NONEON-NOSVE-NEXT:    udiv w24, w10, w13
-; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w8
-; NONEON-NOSVE-NEXT:    udiv w23, w9, w11
-; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w28, w18, w2
-; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    umov w10, v3.b[11]
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
-; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
-; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w27, w16, w0
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
-; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
-; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
-; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
-; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
-; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w4, w19, w21
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
-; NONEON-NOSVE-NEXT:    umov w12, v3.b[12]
-; NONEON-NOSVE-NEXT:    umov w14, v2.b[12]
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
-; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w13, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
-; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
-; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
-; NONEON-NOSVE-NEXT:    umov w16, v3.b[13]
-; NONEON-NOSVE-NEXT:    umov w17, v2.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
-; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
-; NONEON-NOSVE-NEXT:    udiv w15, w14, w12
-; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
-; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
-; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
-; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
-; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
-; NONEON-NOSVE-NEXT:    umov w0, v3.b[14]
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
-; NONEON-NOSVE-NEXT:    umov w1, v2.b[14]
-; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
-; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
-; NONEON-NOSVE-NEXT:    udiv w2, w1, w0
-; NONEON-NOSVE-NEXT:    umov w9, v3.b[15]
-; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
-; NONEON-NOSVE-NEXT:    umov w4, v2.b[15]
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w11, w4, w9
-; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[15]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w14, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
-; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #320
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -1876,29 +1604,31 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -1931,47 +1661,50 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.h[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.h[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.h[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[6]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -2020,135 +1753,95 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w20, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w21, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w19, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w5, v1.h[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w1, v3.h[1]
-; NONEON-NOSVE-NEXT:    umov w23, v2.h[1]
-; NONEON-NOSVE-NEXT:    umov w25, v3.h[0]
-; NONEON-NOSVE-NEXT:    umov w26, v2.h[0]
-; NONEON-NOSVE-NEXT:    umov w6, v1.h[5]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w3, v0.h[5]
-; NONEON-NOSVE-NEXT:    umov w4, v1.h[6]
-; NONEON-NOSVE-NEXT:    umov w7, v0.h[6]
-; NONEON-NOSVE-NEXT:    umov w28, v3.h[2]
-; NONEON-NOSVE-NEXT:    umov w29, v2.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v3.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v2.h[3]
-; NONEON-NOSVE-NEXT:    umov w12, v3.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v3.h[5]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w21, w20
-; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[3]
-; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w11, v2.h[4]
-; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
-; NONEON-NOSVE-NEXT:    udiv w9, w19, w8
-; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w10, v3.h[6]
-; NONEON-NOSVE-NEXT:    fmov s5, w20
-; NONEON-NOSVE-NEXT:    umov w20, v3.h[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w2, w5
-; NONEON-NOSVE-NEXT:    udiv w24, w23, w1
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w27, w26, w25
-; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
-; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w9, w3, w6
-; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w21
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
-; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
-; NONEON-NOSVE-NEXT:    udiv w8, w7, w4
-; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
-; NONEON-NOSVE-NEXT:    umov w23, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w30, w29, w28
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v2.h[5]
-; NONEON-NOSVE-NEXT:    umov w8, v2.h[6]
-; NONEON-NOSVE-NEXT:    udiv w18, w13, w15
-; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
-; NONEON-NOSVE-NEXT:    udiv w16, w11, w12
-; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w17, w9, w14
-; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
-; NONEON-NOSVE-NEXT:    udiv w24, w8, w10
-; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
-; NONEON-NOSVE-NEXT:    udiv w18, w23, w20
-; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w15, w13
-; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
-; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -2171,19 +1864,20 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -2203,26 +1897,28 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -2246,61 +1942,50 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    fmov w3, s2
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w2, s3
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
-; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
-; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
-; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w4, w3, w2
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s1, w11
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    udiv w1, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
-; NONEON-NOSVE-NEXT:    udiv w7, w6, w5
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
-; NONEON-NOSVE-NEXT:    udiv w21, w20, w19
-; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    udiv w9, w23, w22
-; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -2323,13 +2008,15 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -2349,16 +2036,19 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -2382,29 +2072,33 @@ define void @urem_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x15, d2
-; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x14, d3
-; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x11, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    udiv x16, x15, x14
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    udiv x1, x18, x17
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index b3adf4720ece8f..019da3710f11b0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -20,10 +20,28 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
@@ -43,10 +61,44 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8b, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
@@ -66,10 +118,75 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.16b, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
@@ -92,16 +209,147 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.16b, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <32 x i8>, ptr %a
   %op2 = load volatile <32 x i8>, ptr %b
@@ -125,10 +373,18 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
@@ -149,10 +405,28 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
@@ -173,10 +447,43 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
@@ -200,16 +507,83 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x i16>, ptr %a
   %op2 = load volatile <16 x i16>, ptr %b
@@ -233,10 +607,18 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
@@ -257,10 +639,23 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
@@ -284,16 +679,43 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x i32>, ptr %a
   %op2 = load volatile <8 x i32>, ptr %b
@@ -318,10 +740,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
@@ -343,10 +770,17 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2d, x8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
@@ -371,16 +805,31 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel x11, x8, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, ne
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel x11, x8, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, ne
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x i64>, ptr %a
   %op2 = load volatile <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index a429cd82a44993..1cbf2887166c70 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -23,12 +23,27 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w10, w11, w10
+; NONEON-NOSVE-NEXT:    asr w11, w13, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w14, w9
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -46,8 +61,43 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sshl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -65,8 +115,74 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    sshl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -86,13 +202,143 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    sshl v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sshl v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -115,12 +361,18 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    asr w9, w11, w10
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -138,8 +390,27 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -157,8 +428,42 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -178,13 +483,79 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sshl v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sshl v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -205,8 +576,17 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -224,8 +604,22 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -245,13 +639,39 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshl v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sshl v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -272,8 +692,14 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg d1, d1
-; NONEON-NOSVE-NEXT:    sshl d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -291,8 +717,16 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -312,13 +746,27 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshl v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sshl v1.2d, v3.2d, v1.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -345,11 +793,27 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w10, w11, w10
+; NONEON-NOSVE-NEXT:    lsr w11, w13, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w14, w9
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -367,8 +831,43 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -386,8 +885,74 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -407,13 +972,143 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ushl v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -436,11 +1131,18 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w11, w10
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -458,8 +1160,27 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -477,8 +1198,42 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -498,13 +1253,79 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ushl v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -525,8 +1346,17 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -544,8 +1374,22 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -565,13 +1409,39 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushl v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -592,8 +1462,14 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg d1, d1
-; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -611,8 +1487,16 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -632,13 +1516,27 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushl v1.2d, v3.2d, v1.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -664,9 +1562,18 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v2i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x0000ff000000ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i8> %op1, %op2
   ret <2 x i8> %res
@@ -685,9 +1592,27 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w10, w11, w10
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w10, w9
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -705,7 +1630,43 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -723,7 +1684,74 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -743,11 +1771,143 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ushl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -768,7 +1928,27 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -786,7 +1966,42 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -806,11 +2021,79 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ushl v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -831,7 +2114,17 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -849,7 +2142,22 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -869,11 +2177,39 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushl v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -894,7 +2230,14 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -912,7 +2255,16 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -932,11 +2284,27 @@ define void @shl_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushl v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index d9ca19baea7d5b..51dc17fe48e2e5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -19,9 +19,26 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
@@ -39,17 +56,43 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x half>
@@ -69,25 +112,76 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x half>
@@ -111,9 +205,15 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
@@ -131,8 +231,21 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
@@ -154,15 +267,33 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x float>
@@ -192,21 +323,57 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x float>
@@ -229,9 +396,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v0.h[0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <1 x i16> %op1 to <1 x double>
   ret <1 x double> %res
@@ -250,10 +421,16 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
@@ -275,17 +452,31 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = uitofp <4 x i16> %op1 to <4 x double>
@@ -317,26 +508,53 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x double>
@@ -389,42 +607,99 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    ucvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #156]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #152]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #148]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #328]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #188]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #176]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #172]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x double>
@@ -448,9 +723,18 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
@@ -468,8 +752,24 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
@@ -491,11 +791,39 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x half>
@@ -524,17 +852,72 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = uitofp <16 x i32> %op1 to <16 x half>
@@ -557,7 +940,14 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
@@ -574,7 +964,18 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
@@ -592,10 +993,28 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x float>
@@ -619,8 +1038,16 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
@@ -642,15 +1069,23 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = uitofp <4 x i32> %op1 to <4 x double>
@@ -680,21 +1115,37 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x double>
@@ -725,14 +1176,17 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
 ; NONEON-NOSVE-NEXT:    ucvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h2, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
@@ -757,12 +1211,25 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x half>
@@ -800,18 +1267,43 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v2.2s, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.4s, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x half>
@@ -834,8 +1326,14 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
@@ -857,11 +1355,19 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x float>
@@ -890,17 +1396,32 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x float>
@@ -923,7 +1444,14 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
@@ -941,10 +1469,20 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x double>
@@ -967,9 +1505,26 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
@@ -987,17 +1542,43 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x half>
@@ -1017,25 +1598,76 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x half>
@@ -1058,9 +1690,15 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1078,8 +1716,21 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
@@ -1101,15 +1752,33 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x float>
@@ -1139,21 +1808,57 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x float>
@@ -1179,10 +1884,16 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
@@ -1204,17 +1915,29 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = sitofp <4 x i16> %op1 to <4 x double>
@@ -1246,26 +1969,49 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x double>
@@ -1318,42 +2064,92 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d1, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x double>
@@ -1377,9 +2173,18 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
@@ -1397,8 +2202,24 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
@@ -1420,11 +2241,39 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x half>
@@ -1446,7 +2295,14 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1463,7 +2319,18 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
@@ -1481,10 +2348,28 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x float>
@@ -1508,8 +2393,15 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
@@ -1531,15 +2423,21 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = sitofp <4 x i32> %op1 to <4 x double>
@@ -1569,21 +2467,33 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x double>
@@ -1632,36 +2542,68 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #-64]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q3, q5, [x1, #64]
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    scvtf d2, w9
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    scvtf d0, w9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    scvtf d1, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp q4, q6, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr q7, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #160]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr q5, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = sitofp <16 x i32> %op1 to <16 x double>
@@ -1692,14 +2634,17 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
 ; NONEON-NOSVE-NEXT:    scvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h2, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
@@ -1724,12 +2669,25 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x half>
@@ -1752,8 +2710,14 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1775,11 +2739,19 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x float>
@@ -1801,7 +2773,14 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
@@ -1819,10 +2798,20 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x double>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 42daa4fedc949b..e419293b989446 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -22,9 +22,40 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
@@ -47,9 +78,68 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.8b, v2.8b, #7
-; NONEON-NOSVE-NEXT:    cmlt v2.8b, v2.8b, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #21]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w13, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w13, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w15, #0xff
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    csel w13, w16, w13, ne
+; NONEON-NOSVE-NEXT:    tst w14, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w13, ne
+; NONEON-NOSVE-NEXT:    tst w12, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w11, #0xff
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w10, #0xff
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xff
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #9]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xff
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
@@ -72,9 +162,124 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
 ;
 ; NONEON-NOSVE-LABEL: select_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.16b, v2.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v2.16b, v2.16b, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #47]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w2, w2, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w4, w4, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w3, w3, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #42]
+; NONEON-NOSVE-NEXT:    tst w2, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w1, w1, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #41]
+; NONEON-NOSVE-NEXT:    csel w2, w6, w5, ne
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w4, #0xff
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w0, w0, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w18, w18, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w17, w17, #0, #1
+; NONEON-NOSVE-NEXT:    csel w2, w5, w2, ne
+; NONEON-NOSVE-NEXT:    tst w3, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #37]
+; NONEON-NOSVE-NEXT:    csel w2, w4, w2, ne
+; NONEON-NOSVE-NEXT:    tst w1, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w16, w16, #0, #1
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #28]
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #36]
+; NONEON-NOSVE-NEXT:    csel w1, w3, w2, ne
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #11]
+; NONEON-NOSVE-NEXT:    tst w0, #0xff
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #27]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csel w0, w2, w1, ne
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w18, #0xff
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #26]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w18, w1, w0, ne
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    tst w17, #0xff
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #25]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w17, w0, w18, ne
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w16, #0xff
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w16, w18, w17, ne
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #7]
+; NONEON-NOSVE-NEXT:    tst w15, #0xff
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w15, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w14, #0xff
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w14, w16, w15, ne
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #5]
+; NONEON-NOSVE-NEXT:    tst w13, #0xff
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w14, ne
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w12, #0xff
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w11, #0xff
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w10, #0xff
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xff
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xff
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
@@ -95,14 +300,204 @@ define void @select_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cmeq v5.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 208
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #37]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #21]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w8, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w16, w14
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #22]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #4] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w8, w16, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w18
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w12, w1, w18, eq
+; NONEON-NOSVE-NEXT:    cmp w2, w13
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w13, w2, w13, eq
+; NONEON-NOSVE-NEXT:    cmp w16, w14
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w14, w16, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w18
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w16, w1, w18, eq
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #42]
+; NONEON-NOSVE-NEXT:    cmp w5, w2
+; NONEON-NOSVE-NEXT:    csel w18, w5, w2, eq
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w6, w1
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #45]
+; NONEON-NOSVE-NEXT:    csel w1, w6, w1, eq
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    cmp w5, w2
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #29]
+; NONEON-NOSVE-NEXT:    str w8, [sp] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w2, w5, w2, eq
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w19, w6
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    csel w5, w19, w6, eq
+; NONEON-NOSVE-NEXT:    cmp w30, w29
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w6, w30, w29, eq
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #49]
+; NONEON-NOSVE-NEXT:    csel w19, w8, w9, eq
+; NONEON-NOSVE-NEXT:    cmp w10, w21
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    csel w21, w10, w21, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w22
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    csel w22, w11, w22, eq
+; NONEON-NOSVE-NEXT:    cmp w29, w28
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    csel w11, w29, w28, eq
+; NONEON-NOSVE-NEXT:    cmp w8, w27
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #53]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w27, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w26
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #54]
+; NONEON-NOSVE-NEXT:    csel w9, w9, w26, eq
+; NONEON-NOSVE-NEXT:    cmp w10, w25
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #55]
+; NONEON-NOSVE-NEXT:    csel w10, w10, w25, eq
+; NONEON-NOSVE-NEXT:    cmp w28, w24
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #56]
+; NONEON-NOSVE-NEXT:    csel w24, w28, w24, eq
+; NONEON-NOSVE-NEXT:    cmp w27, w23
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #57]
+; NONEON-NOSVE-NEXT:    csel w23, w27, w23, eq
+; NONEON-NOSVE-NEXT:    cmp w26, w20
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #58]
+; NONEON-NOSVE-NEXT:    csel w20, w26, w20, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w7
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #59]
+; NONEON-NOSVE-NEXT:    csel w7, w25, w7, eq
+; NONEON-NOSVE-NEXT:    cmp w28, w4
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #60]
+; NONEON-NOSVE-NEXT:    csel w4, w28, w4, eq
+; NONEON-NOSVE-NEXT:    cmp w27, w3
+; NONEON-NOSVE-NEXT:    csel w3, w27, w3, eq
+; NONEON-NOSVE-NEXT:    cmp w26, w17
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #61]
+; NONEON-NOSVE-NEXT:    csel w17, w26, w17, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w15
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #78]
+; NONEON-NOSVE-NEXT:    csel w15, w25, w15, eq
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w27, w28
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #99]
+; NONEON-NOSVE-NEXT:    csel w27, w27, w28, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    csel w25, w25, w26, eq
+; NONEON-NOSVE-NEXT:    cmp w30, w29
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    csel w26, w30, w29, eq
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w26, [sp, #111]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    cmp w29, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w25, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w27, [sp, #109]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    csel w8, w29, w28, eq
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #106]
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #105]
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #103]
+; NONEON-NOSVE-NEXT:    strb w23, [sp, #102]
+; NONEON-NOSVE-NEXT:    strb w24, [sp, #101]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #100]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #97]
+; NONEON-NOSVE-NEXT:    strb w22, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #87]
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #86]
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #85]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #80]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -129,9 +524,25 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    csel w8, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w8, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
@@ -154,9 +565,40 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
@@ -180,10 +622,68 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #47]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csel w13, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w15, #0xffff
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    csel w13, w16, w13, ne
+; NONEON-NOSVE-NEXT:    tst w14, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w13, ne
+; NONEON-NOSVE-NEXT:    tst w12, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
@@ -204,14 +704,98 @@ define void @select_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    cmeq v5.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #112
+; NONEON-NOSVE-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w15, w14
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w14, w15, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w17, w16
+; NONEON-NOSVE-NEXT:    csel w16, w17, w16, eq
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w12, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w17
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w17, w1, w17, eq
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w4, w3
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #52]
+; NONEON-NOSVE-NEXT:    csel w3, w4, w3, eq
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    cmp w5, w1
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #38]
+; NONEON-NOSVE-NEXT:    csel w1, w5, w1, eq
+; NONEON-NOSVE-NEXT:    cmp w7, w6
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #40]
+; NONEON-NOSVE-NEXT:    csel w6, w7, w6, eq
+; NONEON-NOSVE-NEXT:    cmp w4, w2
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #42]
+; NONEON-NOSVE-NEXT:    csel w2, w4, w2, eq
+; NONEON-NOSVE-NEXT:    cmp w19, w13
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #44]
+; NONEON-NOSVE-NEXT:    csel w13, w19, w13, eq
+; NONEON-NOSVE-NEXT:    cmp w5, w18
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    csel w18, w5, w18, eq
+; NONEON-NOSVE-NEXT:    cmp w7, w15
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    csel w15, w7, w15, eq
+; NONEON-NOSVE-NEXT:    cmp w4, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    csel w11, w4, w11, eq
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w19, w10
+; NONEON-NOSVE-NEXT:    csel w10, w19, w10, eq
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    cmp w5, w4
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #94]
+; NONEON-NOSVE-NEXT:    csel w8, w5, w4, eq
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #90]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w3, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w17, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -238,9 +822,25 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    csel w8, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w8, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
@@ -264,10 +864,40 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w11, #0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    cmp w10, #0
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
@@ -288,14 +918,43 @@ define void @select_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    cmeq v5.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldp w12, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w10, w9
+; NONEON-NOSVE-NEXT:    csel w9, w10, w9, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldp w15, w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w12, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w14, w11
+; NONEON-NOSVE-NEXT:    ldp w10, w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w11, w14, w11, eq
+; NONEON-NOSVE-NEXT:    ldp w17, w14, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w18, w1, [sp, #40]
+; NONEON-NOSVE-NEXT:    cmp w10, w15
+; NONEON-NOSVE-NEXT:    stp w12, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    csel w10, w10, w15, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w16
+; NONEON-NOSVE-NEXT:    ldr w15, [sp]
+; NONEON-NOSVE-NEXT:    csel w13, w13, w16, eq
+; NONEON-NOSVE-NEXT:    cmp w18, w17
+; NONEON-NOSVE-NEXT:    csel w16, w18, w17, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w14
+; NONEON-NOSVE-NEXT:    stp w10, w13, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w10, w1, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w15, w8
+; NONEON-NOSVE-NEXT:    csel w8, w15, w8, eq
+; NONEON-NOSVE-NEXT:    stp w16, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -321,10 +980,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
@@ -348,10 +1012,25 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp, #8]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx x9, x9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    csel x8, x11, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x10, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #0
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x10, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
@@ -372,14 +1051,30 @@ define void @select_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmeq v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x10, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, eq
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, eq
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp]
+; NONEON-NOSVE-NEXT:    cmp x13, x12
+; NONEON-NOSVE-NEXT:    csel x12, x13, x12, eq
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    stp x9, x12, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, eq
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 318285ded5a824..b4f832ff7031da 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -33,19 +33,23 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    add x0, sp, #12
+; NONEON-NOSVE-NEXT:    add x0, sp, #28
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    umov w8, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[0]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    strb w8, [x19, #1]
 ; NONEON-NOSVE-NEXT:    strb w9, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [4 x i8]
   call void @def(ptr %alloc)
@@ -88,21 +92,25 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v6i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    add x0, sp, #8
+; NONEON-NOSVE-NEXT:    add x0, sp, #24
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add x9, x19, #2
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    xtn v1.8b, v1.8h
-; NONEON-NOSVE-NEXT:    str s1, [sp, #4]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    st1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    strh w8, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #2]
+; NONEON-NOSVE-NEXT:    strh w9, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [6 x i8]
   call void @def(ptr %alloc)
@@ -135,18 +143,38 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #48
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #112
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #96] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    add x0, sp, #64
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    add x8, x19, #8
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
-; NONEON-NOSVE-NEXT:    st1 { v1.b }[0], [x8]
-; NONEON-NOSVE-NEXT:    str d0, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [32 x i8]
   call void @def(ptr %alloc)
@@ -179,18 +207,26 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #80
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #160] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    add x0, sp, #96
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp]
-; NONEON-NOSVE-NEXT:    zip1 v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [8 x double]
   call void @def(ptr %alloc)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 01a7a5cafd26b6..22fe1dc0cbec7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -22,15 +22,68 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ;
 ; NONEON-NOSVE-LABEL: test:
 ; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
-; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v5.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    dup v0.4s, v1.s[2]
-; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q4, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
@@ -59,15 +112,71 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ;
 ; NONEON-NOSVE-LABEL: test2:
 ; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
-; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    dup v0.2s, v1.s[2]
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q4, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
index c57f3af0d4b60f..b3ef90e855c9ba 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -15,9 +15,18 @@ define <4 x i8> @load_v4i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: load_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #1]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i8>, ptr %a
   ret <4 x i8> %load
@@ -75,11 +84,14 @@ define <2 x i16> @load_v2i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: load_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12]
 ; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i16>, ptr %a
   ret <2 x i16> %load
@@ -93,7 +105,12 @@ define <2 x half> @load_v2f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: load_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x half>, ptr %a
   ret <2 x half> %load
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index 65c45587e1203e..41f486854343db 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -21,10 +21,17 @@ define i8 @andv_v4i8(<4 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a)
   ret i8 %res
@@ -41,11 +48,25 @@ define i8 @andv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w14
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -64,13 +85,37 @@ define i8 @andv_v16i8(<16 x i8> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w11, w14, w13
+; NONEON-NOSVE-NEXT:    and w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w12, w12, w14
+; NONEON-NOSVE-NEXT:    and w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w9, w10, w9
+; NONEON-NOSVE-NEXT:    and w10, w12, w16
+; NONEON-NOSVE-NEXT:    and w8, w8, w15
+; NONEON-NOSVE-NEXT:    and w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
@@ -90,17 +135,72 @@ define i8 @andv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w9, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w11, w15, w14
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w14, w15, w14
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w9, w9, w14
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w10, w10, w12
+; NONEON-NOSVE-NEXT:    and w11, w16, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, w11
+; NONEON-NOSVE-NEXT:    and w11, w17, w13
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
@@ -118,9 +218,12 @@ define i16 @andv_v2i16(<2 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a)
   ret i16 %res
@@ -137,10 +240,17 @@ define i16 @andv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -159,11 +269,20 @@ define i16 @andv_v8i16(<8 x i16> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w14
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -184,16 +303,40 @@ define i16 @andv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w9, w11, w10
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w13, w15, w14
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w9, w12, w13
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w10, w14, w10
+; NONEON-NOSVE-NEXT:    and w11, w15, w11
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w13, w12
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
@@ -211,9 +354,12 @@ define i32 @andv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -232,12 +378,11 @@ define i32 @andv_v4i32(<4 x i32> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -256,15 +401,20 @@ define i32 @andv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    and w9, w11, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w14, w12
+; NONEON-NOSVE-NEXT:    and w11, w15, w13
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
@@ -284,10 +434,8 @@ define i64 @andv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    and x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -306,13 +454,13 @@ define i64 @andv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    and x9, x11, x9
+; NONEON-NOSVE-NEXT:    and x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
@@ -334,10 +482,17 @@ define i8 @eorv_v4i8(<4 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a)
   ret i8 %res
@@ -354,11 +509,25 @@ define i8 @eorv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w12, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w14
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -377,13 +546,37 @@ define i8 @eorv_v16i8(<16 x i8> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w11, w14, w13
+; NONEON-NOSVE-NEXT:    eor w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w12, w12, w14
+; NONEON-NOSVE-NEXT:    eor w8, w8, w11
+; NONEON-NOSVE-NEXT:    eor w9, w10, w9
+; NONEON-NOSVE-NEXT:    eor w10, w12, w16
+; NONEON-NOSVE-NEXT:    eor w8, w8, w15
+; NONEON-NOSVE-NEXT:    eor w9, w9, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w13
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
@@ -403,17 +596,72 @@ define i8 @eorv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    eor w9, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    eor w11, w15, w14
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w14, w15, w14
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w14
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w10, w10, w12
+; NONEON-NOSVE-NEXT:    eor w11, w16, w11
+; NONEON-NOSVE-NEXT:    eor w10, w10, w11
+; NONEON-NOSVE-NEXT:    eor w11, w17, w13
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
@@ -431,9 +679,12 @@ define i16 @eorv_v2i16(<2 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a)
   ret i16 %res
@@ -450,10 +701,17 @@ define i16 @eorv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -472,11 +730,20 @@ define i16 @eorv_v8i16(<8 x i16> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w12, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w14
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -497,16 +764,40 @@ define i16 @eorv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w9, w11, w10
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w13, w15, w14
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w9, w12, w13
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w10, w14, w10
+; NONEON-NOSVE-NEXT:    eor w11, w15, w11
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w13, w12
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
@@ -524,9 +815,12 @@ define i32 @eorv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -545,12 +839,11 @@ define i32 @eorv_v4i32(<4 x i32> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -569,15 +862,20 @@ define i32 @eorv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    eor w9, w11, w9
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w14, w12
+; NONEON-NOSVE-NEXT:    eor w11, w15, w13
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
@@ -597,10 +895,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    eor x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -619,13 +915,13 @@ define i64 @eorv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    eor x9, x11, x9
+; NONEON-NOSVE-NEXT:    eor x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
@@ -647,10 +943,17 @@ define i8 @orv_v4i8(<4 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a)
   ret i8 %res
@@ -667,11 +970,25 @@ define i8 @orv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w14
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -690,13 +1007,37 @@ define i8 @orv_v16i8(<16 x i8> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w11, w14, w13
+; NONEON-NOSVE-NEXT:    orr w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w12, w12, w14
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    orr w9, w10, w9
+; NONEON-NOSVE-NEXT:    orr w10, w12, w16
+; NONEON-NOSVE-NEXT:    orr w8, w8, w15
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
@@ -716,17 +1057,72 @@ define i8 @orv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w9, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w11, w15, w14
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w14, w15, w14
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w14
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w10, w10, w12
+; NONEON-NOSVE-NEXT:    orr w11, w16, w11
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    orr w11, w17, w13
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
@@ -744,9 +1140,12 @@ define i16 @orv_v2i16(<2 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a)
   ret i16 %res
@@ -763,10 +1162,17 @@ define i16 @orv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -785,11 +1191,20 @@ define i16 @orv_v8i16(<8 x i16> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w14
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -810,16 +1225,40 @@ define i16 @orv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w9, w11, w10
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w13, w15, w14
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w9, w12, w13
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w10, w14, w10
+; NONEON-NOSVE-NEXT:    orr w11, w15, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w13, w12
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
@@ -837,9 +1276,12 @@ define i32 @orv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -858,12 +1300,11 @@ define i32 @orv_v4i32(<4 x i32> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -882,15 +1323,20 @@ define i32 @orv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w9, w11, w9
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w14, w12
+; NONEON-NOSVE-NEXT:    orr w11, w15, w13
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
@@ -910,10 +1356,8 @@ define i64 @orv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    orr x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -932,13 +1376,13 @@ define i64 @orv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    orr x9, x11, x9
+; NONEON-NOSVE-NEXT:    orr x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 886f97ed988d81..de017e919466cd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -23,40 +23,83 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB0_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[0], [x0]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #110]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #104]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_3
 ; NONEON-NOSVE-NEXT:    b .LBB0_4
 ; NONEON-NOSVE-NEXT:  .LBB0_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI0_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI0_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_4
 ; NONEON-NOSVE-NEXT:  .LBB0_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
 ; NONEON-NOSVE-NEXT:  .LBB0_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
 ; NONEON-NOSVE-NEXT:  .LBB0_6: // %else8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_6
 ; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer)
   ret <4 x i8> %load
@@ -76,64 +119,183 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB1_2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    add x9, sp, #176
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #243]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #244]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #240]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #247]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w8, w10
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB1_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #239]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #61]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #57]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #232]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_3
 ; NONEON-NOSVE-NEXT:    b .LBB1_4
 ; NONEON-NOSVE-NEXT:  .LBB1_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x10, :lo12:.LCPI1_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_4
 ; NONEON-NOSVE-NEXT:  .LBB1_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #222]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #34]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #42]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #200]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #217]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #216]
 ; NONEON-NOSVE-NEXT:  .LBB1_4: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_11
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_12
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_12
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_13
 ; NONEON-NOSVE-NEXT:  .LBB1_6: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_13
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_14
 ; NONEON-NOSVE-NEXT:  .LBB1_7: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_14
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_15
 ; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_15
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_16
 ; NONEON-NOSVE-NEXT:  .LBB1_9: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
-; NONEON-NOSVE-NEXT:  .LBB1_10: // %else20
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_11
+; NONEON-NOSVE-NEXT:  .LBB1_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #7]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:  .LBB1_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB1_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #183]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #191]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #3]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #11]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #184]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_6
-; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #3]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #155]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #152]
 ; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_7
-; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #119]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #127]
+; NONEON-NOSVE-NEXT:    ldurh w9, [sp, #117]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    sturh w9, [sp, #125]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #120]
 ; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_8
-; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #5]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
 ; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_9
-; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_10
-; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #6]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_10
+; NONEON-NOSVE-NEXT:    b .LBB1_11
   %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer)
   ret <8 x i8> %load
 }
@@ -152,112 +314,413 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h1, v0.8h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
-; NONEON-NOSVE-NEXT:  .LBB2_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_19
-; NONEON-NOSVE-NEXT:  .LBB2_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_20
-; NONEON-NOSVE-NEXT:  .LBB2_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_21
-; NONEON-NOSVE-NEXT:  .LBB2_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_22
-; NONEON-NOSVE-NEXT:  .LBB2_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_23
-; NONEON-NOSVE-NEXT:  .LBB2_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_24
-; NONEON-NOSVE-NEXT:  .LBB2_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_25
-; NONEON-NOSVE-NEXT:  .LBB2_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_26
-; NONEON-NOSVE-NEXT:  .LBB2_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_27
-; NONEON-NOSVE-NEXT:  .LBB2_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_28
-; NONEON-NOSVE-NEXT:  .LBB2_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_29
-; NONEON-NOSVE-NEXT:  .LBB2_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_30
-; NONEON-NOSVE-NEXT:  .LBB2_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_31
-; NONEON-NOSVE-NEXT:  .LBB2_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
-; NONEON-NOSVE-NEXT:  .LBB2_16: // %else44
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_2
-; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB2_3
-; NONEON-NOSVE-NEXT:  .LBB2_19: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_4
-; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_5
-; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_6
-; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_7
-; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_8
-; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_9
-; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_10
-; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #9
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_11
-; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_12
-; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #11
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_13
-; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_14
-; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #13
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_15
-; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
-; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x8, x0, #15
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1012]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1014]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1016]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1018]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1020]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1022]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB2_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #975]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #253]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #249]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #241]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #960]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #960]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_3
+; NONEON-NOSVE-NEXT:    b .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_2:
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x10, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #926]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #944]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #942]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #896]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #929]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #928]
+; NONEON-NOSVE-NEXT:  .LBB2_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_20
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_21
+; NONEON-NOSVE-NEXT:  .LBB2_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_22
+; NONEON-NOSVE-NEXT:  .LBB2_7: // %else11
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_23
+; NONEON-NOSVE-NEXT:  .LBB2_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_24
+; NONEON-NOSVE-NEXT:  .LBB2_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_25
+; NONEON-NOSVE-NEXT:  .LBB2_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_26
+; NONEON-NOSVE-NEXT:  .LBB2_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_27
+; NONEON-NOSVE-NEXT:  .LBB2_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_28
+; NONEON-NOSVE-NEXT:  .LBB2_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_29
+; NONEON-NOSVE-NEXT:  .LBB2_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_30
+; NONEON-NOSVE-NEXT:  .LBB2_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_31
+; NONEON-NOSVE-NEXT:  .LBB2_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_32
+; NONEON-NOSVE-NEXT:  .LBB2_17: // %else41
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_19
+; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB2_19: // %else44
+; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #848]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #863]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #880]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #879]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #832]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #866]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_6
+; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #784]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #816]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #812]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #786]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #802]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #768]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #803]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_7
+; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #720]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #752]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #751]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #704]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #740]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_8
+; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #5]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #656]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #688]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #686]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #660]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #640]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_9
+; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #607]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #624]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #623]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #576]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #614]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_10
+; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #528]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #534]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_11
+; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #495]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #488]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_12
+; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #430]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_13
+; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #367]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #344]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #336]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #362]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_14
+; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #11]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #282]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #280]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_15
+; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #239]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_16
+; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #173]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_17
+; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_18
+; NONEON-NOSVE-NEXT:    b .LBB2_19
   %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
   ret <16 x i8> %load
 }
@@ -342,274 +805,815 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    fmov s1, w1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
-; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
-; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp]
-; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
-; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
-; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
-; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
-; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
-; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
-; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
-; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
-; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
-; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
-; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
-; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
-; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
-; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    addv h1, v1.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
-; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
-; NONEON-NOSVE-NEXT:  .LBB3_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
-; NONEON-NOSVE-NEXT:  .LBB3_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
-; NONEON-NOSVE-NEXT:  .LBB3_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
-; NONEON-NOSVE-NEXT:  .LBB3_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
-; NONEON-NOSVE-NEXT:  .LBB3_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
-; NONEON-NOSVE-NEXT:  .LBB3_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
-; NONEON-NOSVE-NEXT:  .LBB3_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
-; NONEON-NOSVE-NEXT:  .LBB3_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
-; NONEON-NOSVE-NEXT:  .LBB3_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
-; NONEON-NOSVE-NEXT:  .LBB3_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
-; NONEON-NOSVE-NEXT:  .LBB3_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
-; NONEON-NOSVE-NEXT:  .LBB3_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
-; NONEON-NOSVE-NEXT:  .LBB3_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
-; NONEON-NOSVE-NEXT:  .LBB3_16: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
-; NONEON-NOSVE-NEXT:  .LBB3_17: // %else47
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
-; NONEON-NOSVE-NEXT:  .LBB3_18: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
-; NONEON-NOSVE-NEXT:  .LBB3_19: // %else53
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
-; NONEON-NOSVE-NEXT:  .LBB3_20: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
-; NONEON-NOSVE-NEXT:  .LBB3_21: // %else59
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
-; NONEON-NOSVE-NEXT:  .LBB3_22: // %else62
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
-; NONEON-NOSVE-NEXT:  .LBB3_23: // %else65
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
-; NONEON-NOSVE-NEXT:  .LBB3_24: // %else68
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
-; NONEON-NOSVE-NEXT:  .LBB3_25: // %else71
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
-; NONEON-NOSVE-NEXT:  .LBB3_26: // %else74
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
-; NONEON-NOSVE-NEXT:  .LBB3_27: // %else77
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
-; NONEON-NOSVE-NEXT:  .LBB3_28: // %else80
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
-; NONEON-NOSVE-NEXT:  .LBB3_29: // %else83
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
-; NONEON-NOSVE-NEXT:  .LBB3_30: // %else86
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
-; NONEON-NOSVE-NEXT:  .LBB3_31: // %else89
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %else92
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #9
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #11
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #13
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x9, x0, #15
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.load46
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.load49
-; NONEON-NOSVE-NEXT:    add x9, x0, #17
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.load52
-; NONEON-NOSVE-NEXT:    add x9, x0, #18
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.load55
-; NONEON-NOSVE-NEXT:    add x9, x0, #19
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.load58
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.load61
-; NONEON-NOSVE-NEXT:    add x9, x0, #21
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.load64
-; NONEON-NOSVE-NEXT:    add x9, x0, #22
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.load67
-; NONEON-NOSVE-NEXT:    add x9, x0, #23
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.load70
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.load73
-; NONEON-NOSVE-NEXT:    add x9, x0, #25
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.load76
-; NONEON-NOSVE-NEXT:    add x9, x0, #26
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.load79
-; NONEON-NOSVE-NEXT:    add x9, x0, #27
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.load82
-; NONEON-NOSVE-NEXT:    add x9, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.load85
-; NONEON-NOSVE-NEXT:    add x9, x0, #29
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.load88
-; NONEON-NOSVE-NEXT:    add x9, x0, #30
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.load91
-; NONEON-NOSVE-NEXT:    add x8, x0, #31
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[15], [x8]
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #2064
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 2080
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2216]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2152]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2272]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2176]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2160]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2024]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2264]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2016]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2031]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2248]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2030]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2029]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2232]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2028]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2224]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2027]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2026]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2200]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2025]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2192]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2023]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2184]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #2022]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2168]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #2021]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2020]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2019]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2088]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2018]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2017]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2008]
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2104]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2000]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2080]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2016]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2015]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2014]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2112]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #2013]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2096]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #2012]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2011]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2010]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2009]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2007]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2006]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2005]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2004]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2003]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2002]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2001]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #2048]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2000]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2050]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2048]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2052]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2054]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2056]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2058]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2060]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    str q0, [sp, #2032]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2034]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2032]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2036]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #2038]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #2040]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #2042]
+; NONEON-NOSVE-NEXT:    add w10, w12, w11
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2044]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2062]
+; NONEON-NOSVE-NEXT:    add w13, w13, w14
+; NONEON-NOSVE-NEXT:    add w14, w15, w16
+; NONEON-NOSVE-NEXT:    add w10, w10, w13
+; NONEON-NOSVE-NEXT:    add w11, w14, w11
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2046]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w8, w12
+; NONEON-NOSVE-NEXT:    add w8, w9, w13
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_0
+; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1744
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB3_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #1999]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #253]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #249]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #241]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1984]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1984]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_3
+; NONEON-NOSVE-NEXT:    b .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1936]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1968]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1950]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1968]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1966]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1920]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1936]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1952]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1920]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1953]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1952]
+; NONEON-NOSVE-NEXT:  .LBB3_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT:  .LBB3_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT:  .LBB3_7: // %else11
+; NONEON-NOSVE-NEXT:    add x9, sp, #1488
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT:  .LBB3_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT:  .LBB3_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT:  .LBB3_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #1232
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT:  .LBB3_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT:  .LBB3_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT:  .LBB3_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT:  .LBB3_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT:  .LBB3_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT:  .LBB3_17: // %else41
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT:  .LBB3_18: // %else44
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT:  .LBB3_19: // %else47
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT:  .LBB3_20: // %else50
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT:  .LBB3_21: // %else53
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT:  .LBB3_22: // %else56
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT:  .LBB3_23: // %else59
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT:  .LBB3_24: // %else62
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT:  .LBB3_25: // %else65
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT:  .LBB3_26: // %else68
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT:  .LBB3_27: // %else71
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT:  .LBB3_28: // %else74
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT:  .LBB3_29: // %else77
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT:  .LBB3_30: // %else80
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT:  .LBB3_31: // %else83
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %else86
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %else89
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_35
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.load91
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #31]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %else92
+; NONEON-NOSVE-NEXT:    add sp, sp, #2064
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1872]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1904]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1887]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1904]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1903]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1856]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1872]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1888]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1856]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1890]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1888]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_6
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1808]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1840]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1820]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1840]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1836]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1792]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1810]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1826]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1808]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1824]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1792]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1827]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1824]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_7
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1744]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1776]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1759]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1776]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1775]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1728]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1744]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1760]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1728]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1764]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1760]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1488
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_8
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #5]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1680]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1712]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1694]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1712]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1710]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1664]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1684]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1700]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1680]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1696]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1664]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1701]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1696]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_9
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1616]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1648]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1631]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1648]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1647]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1600]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1620]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1636]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1616]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1632]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1600]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1638]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1632]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_10
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1552]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1584]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1560]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1584]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1576]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1558]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1536]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1574]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1556]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1572]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1552]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1568]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1536]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1575]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1568]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_11
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1488]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1520]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1503]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1520]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1519]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1472]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1488]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1504]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1472]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1512]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1504]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1232
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_12
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1424]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1456]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1438]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1456]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1454]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1408]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1432]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1448]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1424]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1440]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1449]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1440]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_13
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1360]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1392]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1375]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1392]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1391]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1344]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1368]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1384]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1360]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1376]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1344]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1386]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1376]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_14
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #11]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1296]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1328]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1308]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1328]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1324]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1306]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1280]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1322]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1304]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1320]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1296]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1312]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1280]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1323]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1312]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_15
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1232]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1264]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1247]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1264]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1263]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1216]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1240]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1256]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1232]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1248]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1216]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1260]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1248]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_16
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1168]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1200]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1182]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1200]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1198]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1180]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1196]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1192]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1168]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1184]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1197]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1184]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_17
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1104]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1119]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1135]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1116]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1088]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1132]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1128]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1104]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1088]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1134]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1120]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_18
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1024]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1072]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1038]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1072]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1070]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1036]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1040]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1068]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1032]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1064]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1024]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1056]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1040]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1071]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1056]
+; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_19
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.load46
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #976]
+; NONEON-NOSVE-NEXT:    add x10, sp, #976
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #991]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #1008]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1007]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #960]
+; NONEON-NOSVE-NEXT:    ldurh w9, [x10, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x10, #9]
+; NONEON-NOSVE-NEXT:    sturh w9, [x10, #29]
+; NONEON-NOSVE-NEXT:    ldur x9, [x10, #1]
+; NONEON-NOSVE-NEXT:    stur w11, [x10, #25]
+; NONEON-NOSVE-NEXT:    stur x9, [x10, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #960]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #992]
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_20
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.load49
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #17]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #926]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #944]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #942]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #896]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #929]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #928]
+; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_21
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.load52
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #18]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #848]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #863]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #880]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #879]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #832]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #866]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_22
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.load55
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #19]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #784]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #816]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #812]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #786]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #802]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #768]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #803]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_23
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.load58
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #720]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #752]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #751]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #704]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #740]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #736]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_24
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.load61
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #21]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #656]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #688]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #686]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #660]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #640]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_25
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.load64
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #22]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #592]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #607]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #624]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #623]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #576]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #614]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_26
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.load67
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #23]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #528]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #534]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_27
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.load70
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #495]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #488]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_28
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.load73
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #25]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #432]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #430]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_29
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.load76
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #26]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #367]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #344]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #336]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #362]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_30
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.load79
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #27]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #282]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #280]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_31
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.load82
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #239]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_32
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.load85
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #29]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #173]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_33
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.load88
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #30]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_34
+; NONEON-NOSVE-NEXT:    b .LBB3_35
   %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer)
   ret <32 x i8> %load
 }
@@ -638,27 +1642,36 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
-; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_2
-; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB4_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_3
+; NONEON-NOSVE-NEXT:    b .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI4_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI4_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:  .LBB4_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
   ret <2 x half> %load
@@ -678,39 +1691,84 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h1, v0.4h
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
-; NONEON-NOSVE-NEXT:  .LBB5_2: // %else2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB5_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #106]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_3
+; NONEON-NOSVE-NEXT:    b .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:  .LBB5_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
-; NONEON-NOSVE-NEXT:  .LBB5_3: // %else5
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
-; NONEON-NOSVE-NEXT:  .LBB5_4: // %else8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:  .LBB5_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_2
-; NONEON-NOSVE-NEXT:  .LBB5_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB5_3
 ; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_4
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_6
 ; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #6]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer)
   ret <4 x half> %load
@@ -731,62 +1789,184 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b1, v0.8b
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
-; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_11
-; NONEON-NOSVE-NEXT:  .LBB6_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_12
-; NONEON-NOSVE-NEXT:  .LBB6_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_13
-; NONEON-NOSVE-NEXT:  .LBB6_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_14
-; NONEON-NOSVE-NEXT:  .LBB6_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_15
-; NONEON-NOSVE-NEXT:  .LBB6_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
-; NONEON-NOSVE-NEXT:  .LBB6_8: // %else20
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_2
-; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB6_3
-; NONEON-NOSVE-NEXT:  .LBB6_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_4
-; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_5
-; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_6
-; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_7
-; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_8
-; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    str x29, [sp, #480] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 496
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #470]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #464]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #471]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w8, w10
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB6_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #250]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #242]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #448]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #462]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_3
+; NONEON-NOSVE-NEXT:    b .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_2:
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x10, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #412]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #428]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:  .LBB6_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_12
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_13
+; NONEON-NOSVE-NEXT:  .LBB6_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_14
+; NONEON-NOSVE-NEXT:  .LBB6_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_15
+; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_16
+; NONEON-NOSVE-NEXT:  .LBB6_9: // %else17
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_11
+; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB6_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #480] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #350]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #366]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #352]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_6
+; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #280]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #276]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_7
+; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #222]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #192]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_8
+; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #152]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_9
+; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_10
+; NONEON-NOSVE-NEXT:    b .LBB6_11
   %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
   ret <8 x half> %load
 }
@@ -814,113 +1994,383 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv h2, v0.8h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
-; NONEON-NOSVE-NEXT:  .LBB7_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_19
-; NONEON-NOSVE-NEXT:  .LBB7_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_20
-; NONEON-NOSVE-NEXT:  .LBB7_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_21
-; NONEON-NOSVE-NEXT:  .LBB7_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_22
-; NONEON-NOSVE-NEXT:  .LBB7_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_23
-; NONEON-NOSVE-NEXT:  .LBB7_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_24
-; NONEON-NOSVE-NEXT:  .LBB7_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_25
-; NONEON-NOSVE-NEXT:  .LBB7_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_26
-; NONEON-NOSVE-NEXT:  .LBB7_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_27
-; NONEON-NOSVE-NEXT:  .LBB7_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_28
-; NONEON-NOSVE-NEXT:  .LBB7_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_29
-; NONEON-NOSVE-NEXT:  .LBB7_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_30
-; NONEON-NOSVE-NEXT:  .LBB7_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_31
-; NONEON-NOSVE-NEXT:  .LBB7_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
-; NONEON-NOSVE-NEXT:  .LBB7_16: // %else44
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_2
-; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB7_3
-; NONEON-NOSVE-NEXT:  .LBB7_19: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_4
-; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_5
-; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_6
-; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_7
-; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_8
-; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_9
-; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_10
-; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #18
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_11
-; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_12
-; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #22
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_13
-; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_14
-; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #26
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_15
-; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_16
-; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x8, x0, #30
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1012]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1014]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1016]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1018]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #1020]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w10, w11, w12
+; NONEON-NOSVE-NEXT:    add w11, w13, w14
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w10, w11, w15
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1022]
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB7_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h2, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #250]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #242]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #960]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #974]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #960]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_3
+; NONEON-NOSVE-NEXT:    b .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #912]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #924]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #944]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #940]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #912]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #896]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #930]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #928]
+; NONEON-NOSVE-NEXT:  .LBB7_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_20
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_21
+; NONEON-NOSVE-NEXT:  .LBB7_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_22
+; NONEON-NOSVE-NEXT:  .LBB7_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_23
+; NONEON-NOSVE-NEXT:  .LBB7_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_24
+; NONEON-NOSVE-NEXT:  .LBB7_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_25
+; NONEON-NOSVE-NEXT:  .LBB7_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_26
+; NONEON-NOSVE-NEXT:  .LBB7_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_27
+; NONEON-NOSVE-NEXT:  .LBB7_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_28
+; NONEON-NOSVE-NEXT:  .LBB7_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_29
+; NONEON-NOSVE-NEXT:  .LBB7_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_30
+; NONEON-NOSVE-NEXT:  .LBB7_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_31
+; NONEON-NOSVE-NEXT:  .LBB7_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_32
+; NONEON-NOSVE-NEXT:  .LBB7_17: // %else41
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_19
+; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #30]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB7_19: // %else44
+; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #848]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #880]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #862]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #878]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #832]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #868]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_6
+; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #784]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #792]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #816]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #808]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #788]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #804]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #768]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #806]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_7
+; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #720]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #752]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #734]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #750]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #704]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #736]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_8
+; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #668]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #688]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #684]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #656]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #664]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #672]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #640]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #682]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_9
+; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #600]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #624]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #592]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #606]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #608]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #622]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #576]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #620]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_10
+; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #520]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #512]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #524]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #544]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #556]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #528]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_11
+; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #464]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    str h2, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #496]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #478]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #494]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #2]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #448]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_12
+; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #18]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #412]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #428]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #400]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #384]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_13
+; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #350]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #366]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #352]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_14
+; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #22]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #280]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #276]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #256]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_15
+; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #222]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #192]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_16
+; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #26]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #152]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #128]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_17
+; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #94]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_18
+; NONEON-NOSVE-NEXT:    b .LBB7_19
   %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer)
   ret <16 x half> %load
 }
@@ -939,27 +2389,38 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_4
-; NONEON-NOSVE-NEXT:  .LBB8_2: // %else2
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB8_3: // %cond.load
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB8_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_2
-; NONEON-NOSVE-NEXT:  .LBB8_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_3
+; NONEON-NOSVE-NEXT:    b .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:  .LBB8_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
   ret <2 x float> %load
@@ -980,37 +2441,80 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h1, v0.4h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_6
-; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #208]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB9_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #204]
+; NONEON-NOSVE-NEXT:    stur xzr, [sp, #196]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_3
+; NONEON-NOSVE-NEXT:    b .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x9, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:  .LBB9_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_7
-; NONEON-NOSVE-NEXT:  .LBB9_3: // %else5
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_8
-; NONEON-NOSVE-NEXT:  .LBB9_4: // %else8
+; NONEON-NOSVE-NEXT:  .LBB9_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB9_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_2
-; NONEON-NOSVE-NEXT:  .LBB9_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB9_3
 ; NONEON-NOSVE-NEXT:  .LBB9_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_4
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_6
 ; NONEON-NOSVE-NEXT:  .LBB9_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x8]
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
   ret <4 x float> %load
@@ -1064,63 +2568,170 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    str x29, [sp, #480] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 496
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #464]
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv b2, v0.8b
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_10
-; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB10_11
-; NONEON-NOSVE-NEXT:  .LBB10_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB10_12
-; NONEON-NOSVE-NEXT:  .LBB10_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB10_13
-; NONEON-NOSVE-NEXT:  .LBB10_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB10_14
-; NONEON-NOSVE-NEXT:  .LBB10_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB10_15
-; NONEON-NOSVE-NEXT:  .LBB10_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB10_16
-; NONEON-NOSVE-NEXT:  .LBB10_8: // %else20
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB10_9: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #470]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #464]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #471]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w9, w9, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w11, w9
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI10_0]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w9, w10
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB10_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
-; NONEON-NOSVE-NEXT:  .LBB10_10: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB10_3
-; NONEON-NOSVE-NEXT:  .LBB10_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB10_4
-; NONEON-NOSVE-NEXT:  .LBB10_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB10_5
-; NONEON-NOSVE-NEXT:  .LBB10_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB10_6
-; NONEON-NOSVE-NEXT:  .LBB10_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB10_7
-; NONEON-NOSVE-NEXT:  .LBB10_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB10_8
-; NONEON-NOSVE-NEXT:  .LBB10_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[3], [x8]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #460]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #244]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_3
+; NONEON-NOSVE-NEXT:    b .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #412]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #408]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:  .LBB10_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB10_12
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB10_13
+; NONEON-NOSVE-NEXT:  .LBB10_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB10_14
+; NONEON-NOSVE-NEXT:  .LBB10_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB10_15
+; NONEON-NOSVE-NEXT:  .LBB10_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB10_16
+; NONEON-NOSVE-NEXT:  .LBB10_9: // %else17
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB10_11
+; NONEON-NOSVE-NEXT:  .LBB10_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB10_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #480] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB10_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #348]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #340]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB10_6
+; NONEON-NOSVE-NEXT:  .LBB10_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #256]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #264]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB10_7
+; NONEON-NOSVE-NEXT:  .LBB10_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #220]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #4]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #20]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #192]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB10_8
+; NONEON-NOSVE-NEXT:  .LBB10_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp s1, s2, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB10_9
+; NONEON-NOSVE-NEXT:  .LBB10_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp s1, s3, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp s1, s2, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB10_10
+; NONEON-NOSVE-NEXT:    b .LBB10_11
   %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
   ret <8 x float> %load
 }
@@ -1140,25 +2751,38 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_4
-; NONEON-NOSVE-NEXT:  .LBB11_2: // %else2
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB11_3: // %cond.load
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB11_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_2
-; NONEON-NOSVE-NEXT:  .LBB11_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x8]
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_3
+; NONEON-NOSVE-NEXT:    b .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x9, :lo12:.LCPI11_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr d1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB11_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
   ret <2 x double> %load
@@ -1188,38 +2812,74 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI12_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI12_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv h2, v0.4h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB12_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB12_6
-; NONEON-NOSVE-NEXT:  .LBB12_2: // %else2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI12_0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI12_0]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x4
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w12, #0, #1
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB12_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB12_3
+; NONEON-NOSVE-NEXT:    b .LBB12_4
+; NONEON-NOSVE-NEXT:  .LBB12_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB12_4
+; NONEON-NOSVE-NEXT:  .LBB12_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:  .LBB12_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB12_7
-; NONEON-NOSVE-NEXT:  .LBB12_3: // %else5
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB12_8
-; NONEON-NOSVE-NEXT:  .LBB12_4: // %else8
+; NONEON-NOSVE-NEXT:  .LBB12_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB12_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB12_2
-; NONEON-NOSVE-NEXT:  .LBB12_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB12_3
 ; NONEON-NOSVE-NEXT:  .LBB12_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.d }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB12_4
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB12_6
 ; NONEON-NOSVE-NEXT:  .LBB12_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.d }[1], [x8]
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer)
   ret <4 x double> %load
@@ -1249,34 +2909,51 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #16
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
 ; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
 ; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB13_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB13_3
 ; NONEON-NOSVE-NEXT:    b .LBB13_4
 ; NONEON-NOSVE-NEXT:  .LBB13_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI13_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI13_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB13_4
 ; NONEON-NOSVE-NEXT:  .LBB13_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
 ; NONEON-NOSVE-NEXT:  .LBB13_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB13_6
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:  .LBB13_6: // %else5
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = zext <3 x i16> %load_value to <3 x i32>
@@ -1307,34 +2984,51 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #16
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
 ; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
 ; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB14_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB14_3
 ; NONEON-NOSVE-NEXT:    b .LBB14_4
 ; NONEON-NOSVE-NEXT:  .LBB14_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI14_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI14_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB14_4
 ; NONEON-NOSVE-NEXT:  .LBB14_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
 ; NONEON-NOSVE-NEXT:  .LBB14_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB14_6
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:  .LBB14_6: // %else5
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = sext <3 x i16> %load_value to <3 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index b175dcf3e9a0d4..c4da40af88590f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -23,13 +23,21 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB0_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_6
@@ -38,6 +46,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB0_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
 ; NONEON-NOSVE-NEXT:  .LBB0_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB0_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -50,6 +59,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_4
 ; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
@@ -69,14 +79,39 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB1_9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB1_9
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_10
 ; NONEON-NOSVE-NEXT:  .LBB1_2: // %else2
@@ -92,6 +127,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB1_7: // %else12
 ; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
 ; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB1_9: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -116,6 +152,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_8
 ; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.store13
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
@@ -135,15 +172,89 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
@@ -176,6 +287,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB2_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
 ; NONEON-NOSVE-NEXT:  .LBB2_16: // %else30
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -224,6 +336,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
 ; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
@@ -308,241 +421,328 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    fmov s1, w1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
-; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
-; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp]
-; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
-; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
-; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
-; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
-; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
-; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
-; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
-; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
-; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
-; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
-; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
-; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #160]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #264]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #248]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #224]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #200]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
 ; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
-; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
-; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    addv h1, v1.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #104]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #80]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
+; NONEON-NOSVE-NEXT:    zip1 v2.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w11
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w12, w13, w14
+; NONEON-NOSVE-NEXT:    add w14, w15, w16
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    add w11, w14, w11
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w8, w13
+; NONEON-NOSVE-NEXT:    add w8, w9, w12
+; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_34
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_35
 ; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
 ; NONEON-NOSVE-NEXT:  .LBB3_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
 ; NONEON-NOSVE-NEXT:  .LBB3_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
 ; NONEON-NOSVE-NEXT:  .LBB3_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
 ; NONEON-NOSVE-NEXT:  .LBB3_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
 ; NONEON-NOSVE-NEXT:  .LBB3_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
 ; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
 ; NONEON-NOSVE-NEXT:  .LBB3_9: // %else16
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
 ; NONEON-NOSVE-NEXT:  .LBB3_10: // %else18
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
 ; NONEON-NOSVE-NEXT:  .LBB3_11: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
 ; NONEON-NOSVE-NEXT:  .LBB3_12: // %else22
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
 ; NONEON-NOSVE-NEXT:  .LBB3_13: // %else24
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
 ; NONEON-NOSVE-NEXT:  .LBB3_14: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
 ; NONEON-NOSVE-NEXT:  .LBB3_15: // %else28
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
 ; NONEON-NOSVE-NEXT:  .LBB3_16: // %else30
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
 ; NONEON-NOSVE-NEXT:  .LBB3_17: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
 ; NONEON-NOSVE-NEXT:  .LBB3_18: // %else34
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
 ; NONEON-NOSVE-NEXT:  .LBB3_19: // %else36
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
 ; NONEON-NOSVE-NEXT:  .LBB3_20: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
 ; NONEON-NOSVE-NEXT:  .LBB3_21: // %else40
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
 ; NONEON-NOSVE-NEXT:  .LBB3_22: // %else42
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
 ; NONEON-NOSVE-NEXT:  .LBB3_23: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
 ; NONEON-NOSVE-NEXT:  .LBB3_24: // %else46
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
 ; NONEON-NOSVE-NEXT:  .LBB3_25: // %else48
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
 ; NONEON-NOSVE-NEXT:  .LBB3_26: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
 ; NONEON-NOSVE-NEXT:  .LBB3_27: // %else52
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
 ; NONEON-NOSVE-NEXT:  .LBB3_28: // %else54
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
 ; NONEON-NOSVE-NEXT:  .LBB3_29: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
 ; NONEON-NOSVE-NEXT:  .LBB3_30: // %else58
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
 ; NONEON-NOSVE-NEXT:  .LBB3_31: // %else60
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %else62
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_33
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %cond.store61
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %else62
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.store
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store1
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store1
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store3
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store3
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store5
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store5
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
 ; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store7
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store7
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
 ; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store9
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store9
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
 ; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store11
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store11
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store13
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store13
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
 ; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store15
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store15
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
 ; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store17
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store17
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
 ; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store19
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store19
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
 ; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store21
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store21
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
 ; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store23
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store23
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
 ; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store25
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store25
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
 ; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store27
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store27
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store29
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
 ; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store31
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store31
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store33
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store33
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #17]
 ; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store35
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store35
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #18]
 ; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store37
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store37
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #19]
 ; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store39
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store39
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #20]
 ; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store41
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store41
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #21]
 ; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store43
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store43
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #22]
 ; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store45
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store45
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #23]
 ; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store47
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store47
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #24]
 ; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store49
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store49
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #25]
 ; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store51
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store51
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #26]
 ; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store53
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store53
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #27]
 ; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store55
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store55
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #28]
 ; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store57
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store57
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #29]
 ; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store59
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store59
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #30]
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store61
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
-; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT:    b .LBB3_33
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
   ret void
 }
@@ -571,17 +771,18 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
 ; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -590,6 +791,7 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.store1
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
@@ -609,13 +811,21 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
@@ -624,6 +834,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB5_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
 ; NONEON-NOSVE-NEXT:  .LBB5_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -640,6 +851,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
@@ -660,14 +872,39 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB6_9
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
 ; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
@@ -683,6 +920,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB6_7: // %else12
 ; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
 ; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -715,6 +953,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.store13
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
@@ -743,15 +982,89 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
@@ -784,6 +1097,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
 ; NONEON-NOSVE-NEXT:  .LBB7_16: // %else30
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -848,6 +1162,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #30]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
@@ -868,13 +1183,21 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_6
@@ -883,6 +1206,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB8_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB8_8
 ; NONEON-NOSVE-NEXT:  .LBB8_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB8_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    str wzr, [x0]
@@ -895,6 +1219,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB8_4
 ; NONEON-NOSVE-NEXT:  .LBB8_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
@@ -949,14 +1274,39 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB9_9
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_10
 ; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
@@ -972,6 +1322,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB9_7: // %else12
 ; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB9_16
 ; NONEON-NOSVE-NEXT:  .LBB9_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB9_9: // %cond.store
 ; NONEON-NOSVE-NEXT:    str wzr, [x0]
@@ -996,6 +1347,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB9_8
 ; NONEON-NOSVE-NEXT:  .LBB9_16: // %cond.store13
 ; NONEON-NOSVE-NEXT:    str wzr, [x0, #28]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
@@ -1016,23 +1368,25 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_3
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_4
 ; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.store
 ; NONEON-NOSVE-NEXT:    str xzr, [x0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
 ; NONEON-NOSVE-NEXT:  .LBB10_4: // %cond.store1
 ; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
@@ -1061,13 +1415,21 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_6
@@ -1076,6 +1438,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB11_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB11_8
 ; NONEON-NOSVE-NEXT:  .LBB11_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB11_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    str xzr, [x0]
@@ -1088,6 +1451,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB11_4
 ; NONEON-NOSVE-NEXT:  .LBB11_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    str xzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index d7eaf766e7df7c..2439cd29b8017e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -18,11 +18,22 @@ define void @add_v4i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ldr s1, [x1]
-; NONEON-NOSVE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x1, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [x1, #2]
+; NONEON-NOSVE-NEXT:    ldrb w13, [x0]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w14, [x1, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x1]
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    strb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    add w8, w11, w14
+; NONEON-NOSVE-NEXT:    add w9, w13, w9
+; NONEON-NOSVE-NEXT:    strb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [x0, #1]
+; NONEON-NOSVE-NEXT:    strb w9, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i8>, ptr %a
   %op2 = load <4 x i8>, ptr %b
@@ -42,10 +53,46 @@ define void @add_v8i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
@@ -65,10 +112,77 @@ define void @add_v16i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
@@ -89,11 +203,143 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -116,17 +362,12 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
 ; NONEON-NOSVE-NEXT:    ldrh w9, [x1]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    add x9, x1, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    mov w8, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    strh w9, [x0]
-; NONEON-NOSVE-NEXT:    strh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrh w11, [x1, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    strh w8, [x0]
+; NONEON-NOSVE-NEXT:    strh w9, [x0, #2]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i16>, ptr %a
   %op2 = load <2 x i16>, ptr %b
@@ -146,10 +387,30 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %op2 = load <4 x i16>, ptr %b
@@ -169,10 +430,45 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
@@ -193,11 +489,79 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -218,8 +582,18 @@ define void @abs_v2i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: abs_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
@@ -239,8 +613,25 @@ define void @abs_v4i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: abs_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
@@ -260,10 +651,40 @@ define void @abs_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
@@ -283,8 +704,18 @@ define void @abs_v2i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: abs_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
@@ -304,10 +735,26 @@ define void @abs_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
@@ -328,13 +775,32 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ldr s1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [x1]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str w8, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %op2 = load <2 x half>, ptr %b
@@ -355,13 +821,42 @@ define void @fadd_v4f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %op2 = load <4 x half>, ptr %b
@@ -382,17 +877,69 @@ define void @fadd_v8f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
@@ -415,25 +962,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -454,10 +1103,20 @@ define void @fadd_v2f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %op2 = load <2 x float>, ptr %b
@@ -478,10 +1137,25 @@ define void @fadd_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
@@ -504,11 +1178,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -529,10 +1231,19 @@ define void @fadd_v2f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
@@ -555,11 +1266,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index f595a4219cac9f..da89ba6942016c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -19,10 +19,70 @@ define void @test_revbv16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revbv16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 undef, i32 24, i32 27, i32 undef, i32 29, i32 28, i32 undef, i32 undef>
@@ -43,10 +103,70 @@ define void @test_revbv8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revbv8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
@@ -67,10 +187,70 @@ define void @test_revbv4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revbv4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 31, i32 30, i32 29, i32 undef, i32 27, i32 undef, i32 undef, i32 undef>
@@ -91,10 +271,34 @@ define void @test_revhv8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -115,10 +319,34 @@ define void @test_revhv8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x half>, ptr %a
   %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -139,10 +367,34 @@ define void @test_revhv4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -163,10 +415,22 @@ define void @test_revwv4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revwv4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -187,10 +451,22 @@ define void @test_revwv4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revwv4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -210,7 +486,42 @@ define <16 x i8> @test_revv16i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: test_revv16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %a
   %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
@@ -230,10 +541,22 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: test_revwv8i32v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
@@ -258,14 +581,58 @@ define void @test_revhv32i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    rev64 v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    rev64 v3.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr q3, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x0]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
@@ -285,10 +652,18 @@ define void @test_rev_elts_fail(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_rev_elts_fail:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -358,12 +733,23 @@ define void @test_revv8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revv8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index df786933da88cb..f1ceb9cdfada2c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -72,14 +72,82 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
@@ -212,24 +280,149 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q4, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q5, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q6, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    ldp q7, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v17.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    zip2 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    zip1 v16.8h, v1.8h, v3.8h
-; NONEON-NOSVE-NEXT:    zip2 v1.8h, v1.8h, v3.8h
-; NONEON-NOSVE-NEXT:    zip1 v2.8h, v5.8h, v7.8h
-; NONEON-NOSVE-NEXT:    zip1 v3.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    zip2 v5.8h, v5.8h, v7.8h
-; NONEON-NOSVE-NEXT:    zip2 v4.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    add v6.8h, v16.8h, v17.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    stp q6, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    stp q6, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = load <32 x i16>, ptr %b
@@ -282,14 +475,50 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
@@ -326,14 +555,26 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
@@ -360,15 +601,28 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d3, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd d0, d3, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
@@ -405,12 +659,29 @@ define void @zip_v4i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %a
   %tmp2 = load <4 x i32>, ptr %b
@@ -436,12 +707,22 @@ define void @zip1_v8i32_undef(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w10, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load  volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -465,15 +746,131 @@ define void @trn_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v4.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    trn1 v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    trn2 v2.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
@@ -500,15 +897,32 @@ define void @trn_v8i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x9, :lo12:.LCPI8_1]
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v1.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v1.16b }, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w10, w9, w8
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w11, w10, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
@@ -535,15 +949,79 @@ define void @trn_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v4.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    trn1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    trn2 v2.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
@@ -570,15 +1048,25 @@ define void @trn_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn1 v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    trn2 v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v4.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
@@ -606,15 +1094,25 @@ define void @trn_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v2.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v4.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v1.2d, v2.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
@@ -639,12 +1137,23 @@ define void @trn_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %a
   %tmp2 = load <4 x float>, ptr %b
@@ -670,14 +1179,24 @@ define void @trn_v8i32_undef(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    trn1 v3.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -753,14 +1272,82 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
@@ -811,14 +1398,50 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
@@ -855,14 +1478,26 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
@@ -886,12 +1521,22 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w10, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
@@ -1097,15 +1742,131 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp2 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
@@ -1133,12 +1894,21 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 ; NONEON-NOSVE-LABEL: uzp_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
-; NONEON-NOSVE-NEXT:    ext v2.8b, v0.8b, v0.8b, #2
-; NONEON-NOSVE-NEXT:    trn1 v1.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    zip1 v0.4h, v2.4h, v0.4h
-; NONEON-NOSVE-NEXT:    add v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w8
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %a
   %tmp2 = load <4 x i16>, ptr %b
@@ -1260,15 +2030,79 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp2 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
@@ -1312,15 +2146,31 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp2 v2.4s, v3.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v4.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x8, #9205357640488583168 // =0x7fc000007fc00000
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mov w8, #2143289344 // =0x7fc00000
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    str s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    str s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = load <8 x float>, ptr %b
@@ -1347,15 +2197,27 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v2.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    add v0.2d, v4.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v2.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = load <4 x i64>, ptr %b
@@ -1427,12 +2289,45 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
@@ -1476,10 +2371,23 @@ define void @uzp_v8i32_undef(ptr %a) #0{
 ; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6>
@@ -1507,15 +2415,28 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_vscale2_4:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d3, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd d0, d3, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index 6b3c85f59357e7..0ef3bb04364e97 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -39,19 +39,76 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ptest_v16i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #255 // =0xff
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w9, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w10, w11, w10, hi
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    cmp w10, w9
+; NONEON-NOSVE-NEXT:    csel w9, w10, w9, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csel w8, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
 ; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
@@ -113,29 +170,144 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ptest_or_v16i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orn v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w10, w10, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp]
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w12, w12, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w14, w13, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w12, w14, w12
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    orr w9, w10, w9
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    and w11, w15, #0xff
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #68]
+; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #120]
+; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    csetm w18, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csinv w18, w18, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w0, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csinv w0, w0, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w1, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w1, w1, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w2, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    csinv w2, w2, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w3, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csinv w3, w3, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w4, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w10, w4, wzr, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    and w9, w13, #0xff
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w16, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w17, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w18, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w0, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w1, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w2, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w3, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
 ; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
@@ -207,29 +379,144 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ptest_and_v16i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w9, w9, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w10, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w11, w11, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp]
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w12, w12, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w14, w13, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w12, w14, w12
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    and w9, w10, w9
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    csel w13, w13, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    csel w15, w15, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    and w11, w15, #0xff
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #68]
+; NONEON-NOSVE-NEXT:    csel w16, w16, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #120]
+; NONEON-NOSVE-NEXT:    csel w17, w17, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    csetm w18, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w18, w18, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w0, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w0, w0, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w1, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w1, w1, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w2, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    csel w2, w2, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w3, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csel w3, w3, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w4, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w4, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    and w9, w13, #0xff
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w16, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w17, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w18, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w0, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w1, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w2, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w3, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
 ; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index 0a7352bf49442d..b169188e423375 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -22,9 +22,26 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -41,7 +58,42 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -58,7 +110,74 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -76,10 +195,140 @@ define void @bitreverse_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
@@ -99,9 +348,17 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -118,8 +375,26 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -136,8 +411,42 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -155,12 +464,76 @@ define void @bitreverse_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
@@ -179,8 +552,15 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -197,8 +577,20 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -216,12 +608,32 @@ define void @bitreverse_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
@@ -240,8 +652,13 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -258,8 +675,15 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -277,12 +701,22 @@ define void @bitreverse_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
@@ -306,8 +740,31 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -324,7 +781,26 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -341,7 +817,42 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -359,10 +870,79 @@ define void @bswap_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
@@ -381,7 +961,26 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -398,7 +997,42 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -416,10 +1050,79 @@ define void @bswap_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
@@ -438,7 +1141,26 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -455,7 +1177,42 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -473,10 +1230,79 @@ define void @bswap_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index d86c7d36a1041e..546d4e80ea6909 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -18,15 +18,38 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v1.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #7
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #3
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp]
+; NONEON-NOSVE-NEXT:    sxtb w11, w8
+; NONEON-NOSVE-NEXT:    sxtb w13, w9
+; NONEON-NOSVE-NEXT:    sxtb w14, w10
+; NONEON-NOSVE-NEXT:    sxtb w15, w12
+; NONEON-NOSVE-NEXT:    ubfx w11, w11, #10, #5
+; NONEON-NOSVE-NEXT:    ubfx w13, w13, #10, #5
+; NONEON-NOSVE-NEXT:    ubfx w14, w14, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    ubfx w11, w15, #10, #5
+; NONEON-NOSVE-NEXT:    add w9, w9, w13
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    add w10, w10, w14
+; NONEON-NOSVE-NEXT:    sxtb w9, w9
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    sxtb w10, w10
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    sxtb w11, w11
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #5
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w11, #5
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer)
   ret <4 x i8> %res
@@ -43,9 +66,58 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    usra v0.8b, v1.8b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.8b, v0.8b, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer)
   ret <8 x i8> %res
@@ -62,9 +134,106 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    usra v0.16b, v1.16b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer)
   ret <16 x i8> %res
@@ -82,14 +251,204 @@ define void @sdiv_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    usra v0.16b, v2.16b, #3
-; NONEON-NOSVE-NEXT:    usra v1.16b, v3.16b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
-; NONEON-NOSVE-NEXT:    sshr v1.16b, v1.16b, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer)
@@ -109,16 +468,20 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v1.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    ushr v1.2s, v1.2s, #26
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    sxth w10, w8
+; NONEON-NOSVE-NEXT:    sxth w11, w9
+; NONEON-NOSVE-NEXT:    ubfx w10, w10, #26, #5
+; NONEON-NOSVE-NEXT:    ubfx w11, w11, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w11
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #5, #11
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #5, #11
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer)
   ret <2 x i16> %res
@@ -135,9 +498,34 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer)
   ret <4 x i16> %res
@@ -154,9 +542,58 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    usra v0.8h, v1.8h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer)
   ret <8 x i16> %res
@@ -174,14 +611,108 @@ define void @sdiv_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.8h, v1.8h, #0
-; NONEON-NOSVE-NEXT:    usra v0.8h, v2.8h, #11
-; NONEON-NOSVE-NEXT:    usra v1.8h, v3.8h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
-; NONEON-NOSVE-NEXT:    sshr v1.8h, v1.8h, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer)
@@ -200,9 +731,19 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    usra v0.2s, v1.2s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer)
   ret <2 x i32> %res
@@ -219,9 +760,28 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    usra v0.4s, v1.4s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer)
   ret <4 x i32> %res
@@ -239,14 +799,48 @@ define void @sdiv_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.4s, v1.4s, #0
-; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #27
-; NONEON-NOSVE-NEXT:    usra v1.4s, v3.4s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
-; NONEON-NOSVE-NEXT:    sshr v1.4s, v1.4s, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer)
@@ -265,9 +859,15 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt d1, d0, #0
-; NONEON-NOSVE-NEXT:    usra d0, d1, #59
-; NONEON-NOSVE-NEXT:    sshr d0, d0, #5
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer)
   ret <1 x i64> %res
@@ -285,9 +885,19 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    usra v0.2d, v1.2d, #59
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer)
   ret <2 x i64> %res
@@ -305,14 +915,30 @@ define void @sdiv_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.2d, v1.2d, #0
-; NONEON-NOSVE-NEXT:    usra v0.2d, v2.2d, #59
-; NONEON-NOSVE-NEXT:    usra v1.2d, v3.2d, #59
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index 6f82c97f3b872d..a631fb5533e24d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -18,9 +18,15 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) {
 ;
 ; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
   %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -39,9 +45,25 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
 ;
 ; NONEON-NOSVE-LABEL: interleave_store_without_splat:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -64,12 +86,40 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2)
 ;
 ; NONEON-NOSVE-LABEL: interleave_store_legalization:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v4.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    zip1 v1.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    zip2 v3.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr q3, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                                                                              i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index 6489e8d94d313d..89292bab88b57c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -19,7 +19,14 @@ define <4 x i8> @splat_v4i8(i8 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer
@@ -35,7 +42,18 @@ define <8 x i8> @splat_v8i8(i8 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8b, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer
@@ -51,7 +69,25 @@ define <16 x i8> @splat_v16i8(i8 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.16b, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -67,8 +103,27 @@ define void @splat_v32i8(i8 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.16b, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -85,7 +140,11 @@ define <2 x i16> @splat_v2i16(i16 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer
@@ -101,7 +160,14 @@ define <4 x i16> @splat_v4i16(i16 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -117,7 +183,17 @@ define <8 x i16> @splat_v8i16(i16 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -133,8 +209,19 @@ define void @splat_v16i16(i16 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -151,7 +238,11 @@ define <2 x i32> @splat_v2i32(i32 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -167,7 +258,11 @@ define <4 x i32> @splat_v4i32(i32 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -183,8 +278,13 @@ define void @splat_v8i32(i32 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -201,7 +301,11 @@ define <1 x i64> @splat_v1i64(i64 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d0, x0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str x0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer
@@ -217,7 +321,9 @@ define <2 x i64> @splat_v2i64(i64 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2d, x0
+; NONEON-NOSVE-NEXT:    stp x0, x0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -233,8 +339,11 @@ define void @splat_v4i64(i64 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2d, x0
+; NONEON-NOSVE-NEXT:    stp x0, x0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -256,8 +365,12 @@ define <2 x half> @splat_v2f16(half %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x half> undef, half %a, i64 0
   %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer
@@ -274,8 +387,14 @@ define <4 x half> @splat_v4f16(half %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x half> undef, half %a, i64 0
   %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer
@@ -292,8 +411,17 @@ define <8 x half> @splat_v8f16(half %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x half> undef, half %a, i64 0
   %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer
@@ -310,9 +438,19 @@ define void @splat_v16f16(half %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half %a, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
@@ -330,8 +468,11 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x float> undef, float %a, i64 0
   %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer
@@ -348,8 +489,11 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x float> undef, float %a, i64 0
   %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
@@ -366,9 +510,13 @@ define void @splat_v8f32(float %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float %a, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
@@ -383,6 +531,11 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x double> undef, double %a, i64 0
   %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer
@@ -399,8 +552,9 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x double> undef, double %a, i64 0
   %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer
@@ -417,9 +571,11 @@ define void @splat_v4f64(double %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double %a, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
@@ -440,7 +596,8 @@ define void @splat_imm_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #1
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI24_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI24_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 1, i64 0
@@ -458,8 +615,8 @@ define void @splat_imm_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #2 // =0x2
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI25_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI25_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 2, i64 0
@@ -477,8 +634,8 @@ define void @splat_imm_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #3 // =0x3
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI26_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 3, i64 0
@@ -496,8 +653,8 @@ define void @splat_imm_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #4 // =0x4
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI27_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI27_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 4, i64 0
@@ -519,8 +676,8 @@ define void @splat_imm_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #17664 // =0x4500
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI28_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI28_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half 5.0, i64 0
@@ -538,7 +695,8 @@ define void @splat_imm_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.4s, #6.00000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI29_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI29_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float 6.0, i64 0
@@ -556,7 +714,8 @@ define void @splat_imm_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.2d, #7.00000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI30_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI30_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double 7.0, i64 0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index 41449aa90ba0a7..bfe73d46ad4e93 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -31,7 +31,8 @@ define void @store_v8i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI1_0]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x i8> zeroinitializer, ptr %a
@@ -47,7 +48,8 @@ define void @store_v16i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <16 x i8> zeroinitializer, ptr %a
@@ -63,7 +65,8 @@ define void @store_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> zeroinitializer, ptr %a
@@ -96,7 +99,14 @@ define void @store_v2f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   store <2 x half> zeroinitializer, ptr %a
   ret void
@@ -111,7 +121,8 @@ define void @store_v4i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI6_0]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x i16> zeroinitializer, ptr %a
@@ -127,7 +138,8 @@ define void @store_v4f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI7_0]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x half> zeroinitializer, ptr %a
@@ -143,7 +155,8 @@ define void @store_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x i16> zeroinitializer, ptr %a
@@ -159,7 +172,8 @@ define void @store_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x half> zeroinitializer, ptr %a
@@ -175,7 +189,8 @@ define void @store_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI10_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> zeroinitializer, ptr %a
@@ -191,7 +206,8 @@ define void @store_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI11_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <16 x half> zeroinitializer, ptr %a
@@ -263,7 +279,8 @@ define void @store_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> zeroinitializer, ptr %a
@@ -279,7 +296,8 @@ define void @store_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI17_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x float> zeroinitializer, ptr %a
@@ -295,8 +313,12 @@ define void @store_v1i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   store <1 x i64> zeroinitializer, ptr %a
   ret void
@@ -311,8 +333,12 @@ define void @store_v1f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   store <1 x double> zeroinitializer, ptr %a
   ret void
@@ -355,7 +381,8 @@ define void @store_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI22_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI22_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> zeroinitializer, ptr %a
@@ -371,7 +398,8 @@ define void @store_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI23_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI23_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x double> zeroinitializer, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
index d1873f43681504..1acc0e9fef15c9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
@@ -27,8 +27,12 @@ define void @subvector_v4i8(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: subvector_v4i8:
 ; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strh w8, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w10, [x1]
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i8>, ptr %in
   br label %bb1
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index f0a4368da3ee17..97e20c686508f7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -17,8 +17,27 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v8i16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = trunc <8 x i16> %a to <8 x i8>
@@ -37,9 +56,15 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v4i32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [x1, #3]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w11, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w10, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i8>
@@ -58,8 +83,17 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v4i32i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i16>
@@ -78,8 +112,13 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v2i64i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = trunc <2 x i64> %a to <2 x i32>
@@ -99,10 +138,15 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ;
 ; NONEON-NOSVE-LABEL: store_trunc_v2i256i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
-; NONEON-NOSVE-NEXT:    str q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr x8, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [x0]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i256>, ptr %ap
   %val = trunc <2 x i256> %a to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 4895ffb6858e47..43bfa37d07dd92 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -24,7 +24,41 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = trunc <16 x i16> %a to <16 x i8>
@@ -51,13 +85,125 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #112] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #26]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #91]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #89]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #87]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #85]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w6, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    add w5, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #83]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #81]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #109]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #107]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrh w30, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #105]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #103]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #101]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #95]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #97]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #112] // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i16>, ptr %in
   %b = trunc <32 x i16> %a to <32 x i8>
@@ -97,20 +243,276 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #448
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #416] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v6.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #152] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #230]
+; NONEON-NOSVE-NEXT:    add w21, w8, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #270]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #266]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #262]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #258]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #254]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #250]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #244]
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #242]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #174]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #220]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #170]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #166]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrh w30, [sp, #162]
+; NONEON-NOSVE-NEXT:    strb w21, [sp, #335]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #196]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #204]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #334]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #333]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #331]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #330]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #329]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #327]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #326]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #325]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #323]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #322]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #321]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #319]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #317]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #315]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #314]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #313]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #311]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #309]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #307]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #305]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #303]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #301]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #297]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #295]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #293]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #291]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #289]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #350]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #349]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #347]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #346]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #345]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #343]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #342]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #341]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #339]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #338]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #337]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #152] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x8, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #448
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i16>, ptr %in
   %b = trunc <64 x i16> %a to <64 x i8>
@@ -172,34 +574,598 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v5.16b, v4.16b
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v7.16b, v6.16b
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #800
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #408] // 8-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v16.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uzp1 v5.16b, v17.16b, v5.16b
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v4.16b, v4.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp1 v7.16b, v18.16b, v7.16b
-; NONEON-NOSVE-NEXT:    add v3.16b, v6.16b, v6.16b
-; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v17.16b, v16.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.16b, v5.16b, v5.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v4.16b, v7.16b, v7.16b
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.16b, v6.16b, v6.16b
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #606]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #600]
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #598]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #596]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #594]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #592]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #464]
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w8, [sp, #404] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #400] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #544]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #396] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #608]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #392] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #638]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #636]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #634]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #388] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #666]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #630]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #384] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #628]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #626]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #622]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #380] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #620]
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #618]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #614]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #376] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #610]
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #430]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #372] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #482]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #422]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #368] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #418]
+; NONEON-NOSVE-NEXT:    strb w30, [sp, #767]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #486]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #490]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #494]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #462]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #658]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #660]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #662]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #664]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #668]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #252] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #244] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #530]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #532]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #236] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #534]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #228] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #538]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #540]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #220] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #542]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #212] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #498]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #500]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #502]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #196] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #506]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #508]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #188] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #510]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #180] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #514]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #172] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #518]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #164] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #522]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #156] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #526]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #640]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #148] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #642]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #644]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #140] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #646]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #648]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #650]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #652]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #124] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #654]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #576]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #116] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #578]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #580]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #108] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #582]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #584]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #100] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #586]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #588]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #92] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #590]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #84] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #546]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #548]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #76] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #68] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #554]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #556]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #562]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #564]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #566]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #570]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #572]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #574]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #602]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #604]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #765]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #764]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #763]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #762]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #761]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #760]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #759]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #758]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #757]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #756]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #755]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #754]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #753]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #752]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #751]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #750]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #749]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #748]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #747]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #746]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #745]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #744]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #743]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #742]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #741]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #740]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #739]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #738]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #737]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #766]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #734]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #733]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #732]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #731]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #730]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #729]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #727]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #726]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #725]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #724]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #723]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #722]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #721]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #783]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #782]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #781]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #780]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #779]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #778]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #777]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #776]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #775]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #774]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #773]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #772]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #771]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #770]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #769]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #719]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #156] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #718]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #717]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #164] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #716]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #715]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #172] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #714]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #713]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #180] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #711]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #188] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #710]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #709]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #196] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #708]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #200] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #707]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #204] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #706]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #705]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #212] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #704]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #799]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #220] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #798]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #797]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #228] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #232] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #795]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #236] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #794]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #793]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #244] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #792]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #248] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #791]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #252] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #790]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #256] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #789]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #788]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #787]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #786]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #785]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #276] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #784]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #280] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #768]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #687]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #284] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #686]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #288] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #685]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #292] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #684]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #683]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #682]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #304] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #681]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #308] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #312] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #679]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #316] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #678]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #320] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #324] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #675]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #674]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #336] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #673]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #340] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #344] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #703]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #348] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #702]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #352] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #701]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #356] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #700]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #699]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #698]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #697]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #376] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #695]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #380] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #694]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #693]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #692]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #392] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #691]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #690]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #689]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #408] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #672]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x8]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x8, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x8, #64]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x8, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #800
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <128 x i16>, ptr %in
   %b = trunc <128 x i16> %a to <128 x i8>
@@ -227,8 +1193,21 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i8>
@@ -256,11 +1235,38 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i8>
@@ -302,19 +1308,113 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v7.8h, v6.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp w27, w28, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w25, w26, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w10, w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w23, w24, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w21, w22, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w19, w20, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #155]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #153]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldp w4, w7, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #151]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #149]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #147]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #145]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #175]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #173]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #171]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #170]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #169]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #167]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #166]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    ldp w29, w30, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #165]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #164]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #163]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #159]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #157]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #161]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i8>
@@ -383,32 +1483,273 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
-; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v16.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v19.8h, v18.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v4.16b, v6.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v1.16b, v7.16b
-; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #480
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #152] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w20, w8, w8
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #312]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #396]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #392]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #408]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #324]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #384]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #340]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w6, [sp, #336]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #380]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #376]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #372]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #352]
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #463]
+; NONEON-NOSVE-NEXT:    add w20, w22, w22
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #462]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w29, w28, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp w8, w30, [sp, #160]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #461]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    add w8, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #459]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #457]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #455]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #453]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #451]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #449]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #447]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #445]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #443]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #441]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #439]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #437]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #435]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #433]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #431]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #430]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #429]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #427]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #423]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #422]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #421]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #419]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #417]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #416]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #477]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #475]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #473]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #471]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #152] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x8, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #480
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i8>
@@ -435,7 +1776,21 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i16>
@@ -462,13 +1817,54 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w4, w5, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i16>
@@ -508,20 +1904,115 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #224] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v6.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp w27, w28, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w25, w26, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w10, w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w23, w24, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w21, w22, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w19, w20, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldp w4, w7, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #164]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #204]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    ldp w29, w30, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #196]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #190]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #188]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i16>
@@ -583,34 +2074,276 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #528
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    mov x5, x1
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v16.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v4.8h, v4.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v18.8h, v7.8h
-; NONEON-NOSVE-NEXT:    add v3.8h, v6.8h, v6.8h
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v17.8h, v16.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.8h, v5.8h, v5.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v4.8h, v7.8h, v7.8h
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.8h, v6.8h, v6.8h
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w21, w8, w8
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #380]
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #376]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #296]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #344]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #336]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w6, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #288]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #304]
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #494]
+; NONEON-NOSVE-NEXT:    add w21, w23, w23
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #492]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp w0, w18, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp w2, w1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #392]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #364]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #490]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    add w8, w9, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #486]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #482]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #462]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #510]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #508]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #506]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #502]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #500]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #498]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #432]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #526]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #522]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #518]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #514]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #496]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #410]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #408]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #406]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #402]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #430]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #422]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #400]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x5]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x5, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x5, #64]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x5, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #528
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i16>
@@ -639,8 +2372,15 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i8>
@@ -669,12 +2409,27 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #77]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #75]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i8>
@@ -717,17 +2472,47 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q7, q5, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #143]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #141]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #139]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #135]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #133]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #131]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #129]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i8>
@@ -798,31 +2583,139 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v16.4s, v17.4s, v16.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v19.4s, v18.4s
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v21.4s, v20.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v4.8h, v16.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v2.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v5.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #416
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #216]
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #264]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #232]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #112]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    str q18, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #299]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #294]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #297]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #295]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #290]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #293]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #280]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #291]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #160]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #168]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #289]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #319]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #314]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #317]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #315]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #313]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #311]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #306]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #309]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #303]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #302]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #307]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #301]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #305]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x8]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #416
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i8>
@@ -850,8 +2743,15 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i16>
@@ -879,11 +2779,27 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i16>
@@ -925,19 +2841,66 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #120]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i16>
@@ -1006,32 +2969,140 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v16.4s, v7.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v19.4s, v18.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #432
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #200]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #248]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #208]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #120]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #300]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #152]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #284]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #334]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #330]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #326]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #314]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #272]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #322]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #432
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i16>
@@ -1058,7 +3129,13 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i32>
@@ -1085,13 +3162,34 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i32>
@@ -1131,20 +3229,60 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v6.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i32>
@@ -1206,34 +3344,145 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #464] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v16.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v18.4s, v7.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v6.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v17.4s, v16.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.4s, v5.4s, v5.4s
-; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v7.4s, v7.4s
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.4s, v6.4s, v6.4s
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #480] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #168]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #152]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w9, [sp, #344]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #336]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #248]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #264]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #320]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #380]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #376]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #368]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    str w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    str w9, [sp, #392]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    str w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    str w9, [sp, #384]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    str w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w5, [sp, #364]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w6, [sp, #360]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    str w5, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w6, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #480] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #464] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x1, #64]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index dd308dfadd80c8..ee50c1f4d41962 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -18,8 +18,17 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
-; NONEON-NOSVE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldur w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
   ret <4 x i8> %ret
@@ -38,7 +47,19 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #7
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i8> %ret
@@ -57,7 +78,20 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
                                                                    i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -80,11 +114,35 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #15
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -107,7 +165,12 @@ define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> <i32 1, i32 0>
   ret <2 x i16> %ret
@@ -126,7 +189,17 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i16> %ret
@@ -145,7 +218,18 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i16> %ret
@@ -167,11 +251,31 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -194,7 +298,13 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i32> %ret
@@ -213,7 +323,16 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i32> %ret
@@ -235,11 +354,26 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -261,7 +395,12 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %ret
@@ -283,11 +422,20 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp x10, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -309,7 +457,17 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x half> %ret
@@ -327,7 +485,18 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x half> %ret
@@ -347,11 +516,31 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -373,7 +562,13 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %ret
@@ -391,7 +586,16 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #36]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x float> %ret
@@ -411,11 +615,26 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -436,7 +655,12 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %ret
@@ -456,11 +680,20 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -483,11 +716,21 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v2.16b, #8
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
index 42f3f03a5ea058..2d8b881d3fd1ec 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
@@ -43,7 +43,8 @@ define <2 x i64> @fixed_vec_zero_constant() {
 ;
 ; NONEON-NOSVE-LABEL: fixed_vec_zero_constant:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
 ; NONEON-NOSVE-NEXT:    ret
   ret <2 x i64> zeroinitializer
 }
@@ -57,7 +58,8 @@ define <2 x double> @fixed_vec_fp_zero_constant() {
 ;
 ; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
 ; NONEON-NOSVE-NEXT:    ret
   ret <2 x double> <double 0.0, double 0.0>
 }



More information about the llvm-commits mailing list