[llvm] 5f7502b - [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (#111698)

via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 18 02:19:26 PDT 2024


Author: Benjamin Maxwell
Date: 2024-10-18T10:19:22+01:00
New Revision: 5f7502bf1f193482e23385cdd4cfecf09f19ccbc

URL: https://github.com/llvm/llvm-project/commit/5f7502bf1f193482e23385cdd4cfecf09f19ccbc
DIFF: https://github.com/llvm/llvm-project/commit/5f7502bf1f193482e23385cdd4cfecf09f19ccbc.diff

LOG: [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (#111698)

This allows lowering fixed-length (non-constant) BUILD_VECTORS (<=
128-bit) to a chain of ZIP1 instructions when Neon is not available,
rather than using the default lowering, which is to spill to the stack
and reload.

For example,

```
t5: v4f32 = BUILD_VECTOR(t0, t1, t2, t3)
```

Becomes:

```
zip1 z0.s, z0.s, z1.s // z0 = t0,t1,...
zip1 z2.s, z2.s, z3.s // z2 = t2,t3,...
zip1 z0.d, z0.d, z2.d // z0 = t0,t1,t2,t3,...
```

When values are already in FRPs, this generally seems to lead to a more
compact output with less movement to/from the stack.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a44a73eb2c0fda..d5466e0a1cbd44 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -2111,7 +2112,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
   setOperationAction(ISD::BITREVERSE, VT, Default);
   setOperationAction(ISD::BSWAP, VT, Default);
-  setOperationAction(ISD::BUILD_VECTOR, VT, Default);
+  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
   setOperationAction(ISD::CTLZ, VT, Default);
   setOperationAction(ISD::CTPOP, VT, Default);
@@ -14395,24 +14396,72 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
-SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
-                                                 SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+  auto *BVN = cast<BuildVectorSDNode>(Op);
 
-  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
-    if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
-      SDLoc DL(Op);
-      EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-      SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
-      SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
-      SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
-      return convertFromScalableVector(DAG, Op.getValueType(), Seq);
-    }
+  if (auto SeqInfo = BVN->isConstantSequence()) {
+    SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
+    SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
+    SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
+    return convertFromScalableVector(DAG, VT, Seq);
+  }
+
+  unsigned NumElems = VT.getVectorNumElements();
+  if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
+      NumElems <= 1 || BVN->isConstant())
+    return SDValue();
+
+  auto IsExtractElt = [](SDValue Op) {
+    return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+  };
 
-    // Revert to common legalisation for all other variants.
+  // For integer types that are not already in vectors limit to at most four
+  // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
+  if (VT.getScalarType().isInteger() &&
+      NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
     return SDValue();
+
+  // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
+  SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
+  SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
+      Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
+        return Op.isUndef() ? Undef
+                            : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                                          ContainerVT, Undef, Op, ZeroI64);
+      });
+
+  ElementCount ZipEC = ContainerVT.getVectorElementCount();
+  while (Intermediates.size() > 1) {
+    EVT ZipVT = getPackedSVEVectorVT(ZipEC);
+
+    for (unsigned I = 0; I < Intermediates.size(); I += 2) {
+      SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
+      SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
+      Intermediates[I / 2] =
+          Op1.isUndef() ? Op0
+                        : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+    }
+
+    Intermediates.resize(Intermediates.size() / 2);
+    ZipEC = ZipEC.divideCoefficientBy(2);
   }
 
+  assert(Intermediates.size() == 1);
+  SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
+  return convertFromScalableVector(DAG, VT, Vec);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+    return LowerFixedLengthBuildVectorToSVE(Op, DAG);
+
   // Try to build a simple constant vector.
   Op = NormalizeBuildVector(Op, DAG);
   // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 217e971568a999..160cd18ca53b32 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1244,6 +1244,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
                                               SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 276f23703df3df..20659cde83ee00 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,98 +140,65 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
 ;
 ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_128_NOMAX:       // %bb.0:
-; SVE2_128_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s3
 ; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #14]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #13]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #12]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s3
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #11]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s0
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #10]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #9]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT:    mov z6.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
+; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
 ;
 ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_NOMIN_NOMAX:       // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s3
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s3
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s0
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[7]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z6.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_NOMIN_NOMAX-NEXT:    ret
 ;
 ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_MIN_256_NOMAX:       // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s3
 ; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s3
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s0
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[7]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z6.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_MIN_256_NOMAX-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
@@ -263,89 +230,59 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
 ;
 ; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_128_NOMAX:       // %bb.0:
-; SVE2_128_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT:    ldr d3, [x0]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #14]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z3.b[1]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #13]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #12]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #11]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s0
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #10]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s1
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #9]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
 ;
 ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_NOMIN_NOMAX:       // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d3, [x0]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z3.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s0
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s1
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x0]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[3]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[2]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_NOMIN_NOMAX-NEXT:    ret
 ;
 ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_MIN_256_NOMAX:       // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d3, [x0]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z3.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s0
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s1
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x0]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[3]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[2]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_MIN_256_NOMAX-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
@@ -401,34 +338,23 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
 define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
 ; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    mov z1.b, z0.b[7]
-; CHECK-NEXT:    mov z2.b, z0.b[6]
-; CHECK-NEXT:    mov z3.b, z0.b[4]
-; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    fmov w9, s2
 ; CHECK-NEXT:    mov z2.b, z0.b[3]
-; CHECK-NEXT:    mov z1.b, z1.b[1]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    mov z3.b, z0.b[2]
-; CHECK-NEXT:    strb w9, [sp, #14]
-; CHECK-NEXT:    mov z0.b, z0.b[1]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    strb w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strb w9, [sp, #11]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w9, [sp, #9]
-; CHECK-NEXT:    strb w8, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z4.b, z0.b[1]
+; CHECK-NEXT:    mov z1.b, z1.b[1]
+; CHECK-NEXT:    mov z5.b, z0.b[7]
+; CHECK-NEXT:    mov z6.b, z0.b[6]
+; CHECK-NEXT:    mov z0.b, z0.b[4]
+; CHECK-NEXT:    zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT:    zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT:    zip1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index 617b560713c3ab..478072d33d8c9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -184,13 +184,11 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
 define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
 ; CHECK-LABEL: vls_sve_and_2xi16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fmov s1, wzr
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    stp wzr, w8, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index b9264ad5f77c37..6644be11a02ba7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -91,19 +91,12 @@ define void @bitcast_v32i8(ptr %a, ptr %b) {
 define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: bitcast_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x1]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: bitcast_v2i16:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index b8a2e0e0f4bd4c..9729a1d95cd916 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -222,3 +222,255 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
   store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
   ret void
 }
+
+define void @build_vector_non_const_v4i1(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, w1, lsl #1
+; CHECK-NEXT:    orr w8, w8, w2, lsl #2
+; CHECK-NEXT:    orr w8, w8, w3, lsl #3
+; CHECK-NEXT:    strb w8, [x4]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr w8, w0, w1, lsl #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w2, lsl #2
+; NONEON-NOSVE-NEXT:    orr w8, w8, w3, lsl #3
+; NONEON-NOSVE-NEXT:    strb w8, [x4]
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+  %2 = insertelement <4 x i1>    %1, i1 %b, i64 1
+  %3 = insertelement <4 x i1>    %2, i1 %c, i64 2
+  %4 = insertelement <4 x i1>    %3, i1 %d, i64 3
+  store <4 x i1> %4, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v2f64(double %a, double %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <2 x double> undef, double %a, i64 0
+  %2 = insertelement <2 x double>    %1, double %b, i64 1
+  store <2 x double> %2, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v2f32(float %a, float %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <2 x float> undef, float %a, i64 0
+  %2 = insertelement <2 x float>    %1, float %b, i64 1
+  store <2 x float> %2, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v4f32(float %a, float %b, float %c, float %d, ptr %out)  {
+; CHECK-LABEL: build_vector_non_const_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    // kill: def $s3 killed $s3 def $z3
+; CHECK-NEXT:    // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    zip1 z0.d, z0.d, z2.d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s2, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <4 x float> undef, float %a, i64 0
+  %2 = insertelement <4 x float>    %1, float %b, i64 1
+  %3 = insertelement <4 x float>    %2, float %c, i64 2
+  %4 = insertelement <4 x float>    %3, float %d, i64 3
+  store <4 x float> %4, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v4f64(double %a, double %b, double %c, double %d, ptr %out)  {
+; CHECK-LABEL: build_vector_non_const_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $z3
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d2, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <4 x double> undef, double %a, i64 0
+  %2 = insertelement <4 x double>    %1, double %b, i64 1
+  %3 = insertelement <4 x double>    %2, double %c, i64 2
+  %4 = insertelement <4 x double>    %3, double %d, i64 3
+  store <4 x double> %4, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v8f16(half %a, half %b, half %c, half %d, half %e, half %f, half %g, half %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h6 killed $h6 def $z6
+; CHECK-NEXT:    // kill: def $h4 killed $h4 def $z4
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    // kill: def $h7 killed $h7 def $z7
+; CHECK-NEXT:    // kill: def $h5 killed $h5 def $z5
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $z3
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $z1
+; CHECK-NEXT:    zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z4.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    zip1 z1.s, z4.s, z6.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h7, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h6, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h5, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h4, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half>    %1, half %b, i64 1
+  %3 = insertelement <8 x half>    %2, half %c, i64 2
+  %4 = insertelement <8 x half>    %3, half %d, i64 3
+  %5 = insertelement <8 x half>    %4, half %e, i64 4
+  %6 = insertelement <8 x half>    %5, half %f, i64 5
+  %7 = insertelement <8 x half>    %6, half %g, i64 6
+  %8 = insertelement <8 x half>    %7, half %h, i64 7
+  store <8 x half> %8, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    str d0, [x2]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <2 x i32> undef, i32 %a, i64 0
+  %2 = insertelement <2 x i32>    %1, i32 %b, i64 1
+  store <2 x i32> %2, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    strb w7, [sp, #15]
+; CHECK-NEXT:    ldr x8, [sp, #16]
+; CHECK-NEXT:    strb w6, [sp, #14]
+; CHECK-NEXT:    strb w5, [sp, #13]
+; CHECK-NEXT:    strb w4, [sp, #12]
+; CHECK-NEXT:    strb w3, [sp, #11]
+; CHECK-NEXT:    strb w2, [sp, #10]
+; CHECK-NEXT:    strb w1, [sp, #9]
+; CHECK-NEXT:    strb w0, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <8 x i8> undef, i8 %a, i64 0
+  %2 = insertelement <8 x i8>    %1, i8 %b, i64 1
+  %3 = insertelement <8 x i8>    %2, i8 %c, i64 2
+  %4 = insertelement <8 x i8>    %3, i8 %d, i64 3
+  %5 = insertelement <8 x i8>    %4, i8 %e, i64 4
+  %6 = insertelement <8 x i8>    %5, i8 %f, i64 5
+  %7 = insertelement <8 x i8>    %6, i8 %g, i64 6
+  %8 = insertelement <8 x i8>    %7, i8 %h, i64 7
+  store <8 x i8> %8, ptr %out
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 4b6285b2732fe5..c1810c678ea522 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -12,34 +12,22 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ; CHECK-LABEL: concat_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    mov z2.h, z1.h[3]
-; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z2.h, z1.h[3]
 ; CHECK-NEXT:    mov z3.h, z1.h[2]
-; CHECK-NEXT:    mov z1.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z0.h[3]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strb w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w9, [sp, #14]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z4.h, z1.h[1]
+; CHECK-NEXT:    mov z5.h, z0.h[3]
+; CHECK-NEXT:    mov z6.h, z0.h[2]
+; CHECK-NEXT:    mov z7.h, z0.h[1]
+; CHECK-NEXT:    zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT:    zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT:    zip1 z0.b, z0.b, z7.b
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i8:
@@ -152,22 +140,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
 define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ; CHECK-LABEL: concat_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z2.s, z1.s[1]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z3.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i16:
@@ -428,18 +408,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
 define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ; CHECK-LABEL: concat_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    str h1, [sp, #12]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h2, [sp, #14]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z3.h, z0.h[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f16:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 50a05cb4b1e277..7d6336a43a4fd1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -326,29 +326,29 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ; CHECK-LABEL: load_sext_v2i64i256:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    fmov x8, d0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
-; CHECK-NEXT:    asr x9, x8, #63
-; CHECK-NEXT:    fmov x10, d1
-; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    asr x8, x10, #63
-; CHECK-NEXT:    mov z0.d, x9
-; CHECK-NEXT:    stp x10, x8, [sp, #16]
-; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    ldp q2, q4, [sp], #32
-; CHECK-NEXT:    mov z3.d, z0.d[1]
-; CHECK-NEXT:    mov z5.d, z1.d[1]
-; CHECK-NEXT:    mov z6.d, z2.d[1]
-; CHECK-NEXT:    fmov x2, d0
-; CHECK-NEXT:    mov z0.d, z4.d[1]
-; CHECK-NEXT:    fmov x6, d1
-; CHECK-NEXT:    fmov x0, d2
-; CHECK-NEXT:    fmov x4, d4
-; CHECK-NEXT:    fmov x3, d3
-; CHECK-NEXT:    fmov x7, d5
-; CHECK-NEXT:    fmov x1, d6
-; CHECK-NEXT:    fmov x5, d0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    asr x8, x8, #63
+; CHECK-NEXT:    fmov d3, x8
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    asr x9, x9, #63
+; CHECK-NEXT:    fmov d4, x9
+; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, x9
+; CHECK-NEXT:    fmov x2, d2
+; CHECK-NEXT:    zip1 z1.d, z1.d, z4.d
+; CHECK-NEXT:    mov z4.d, z2.d[1]
+; CHECK-NEXT:    mov z5.d, z0.d[1]
+; CHECK-NEXT:    mov z6.d, z3.d[1]
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    fmov x6, d3
+; CHECK-NEXT:    mov z2.d, z1.d[1]
+; CHECK-NEXT:    fmov x3, d4
+; CHECK-NEXT:    fmov x1, d5
+; CHECK-NEXT:    fmov x4, d1
+; CHECK-NEXT:    fmov x7, d6
+; CHECK-NEXT:    fmov x5, d2
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: load_sext_v2i64i256:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 2665696308463f..a728cbe97056db 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -10,23 +10,15 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ; CHECK-LABEL: extract_subvector_v8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
 ; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
@@ -53,23 +45,15 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ; CHECK-LABEL: extract_subvector_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
 ; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i8:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index dad53b31db0b0f..f1771a753826cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -1126,49 +1126,39 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v4f16_v4f64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    sub sp, sp, #16
-; SVE-NEXT:    .cfi_def_cfa_offset 16
-; SVE-NEXT:    ldp q1, q0, [x1]
-; SVE-NEXT:    ldr d4, [x0]
-; SVE-NEXT:    and z4.h, z4.h, #0x7fff
-; SVE-NEXT:    mov z2.d, z0.d[1]
-; SVE-NEXT:    mov z3.d, z1.d[1]
-; SVE-NEXT:    fcvt h0, d0
+; SVE-NEXT:    ldp q0, q1, [x1]
+; SVE-NEXT:    mov z2.d, z1.d[1]
+; SVE-NEXT:    mov z3.d, z0.d[1]
 ; SVE-NEXT:    fcvt h1, d1
+; SVE-NEXT:    fcvt h0, d0
 ; SVE-NEXT:    fcvt h2, d2
 ; SVE-NEXT:    fcvt h3, d3
-; SVE-NEXT:    str h0, [sp, #12]
-; SVE-NEXT:    str h1, [sp, #8]
-; SVE-NEXT:    str h2, [sp, #14]
-; SVE-NEXT:    str h3, [sp, #10]
-; SVE-NEXT:    ldr d0, [sp, #8]
+; SVE-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    ldr d1, [x0]
+; SVE-NEXT:    and z1.h, z1.h, #0x7fff
 ; SVE-NEXT:    and z0.h, z0.h, #0x8000
-; SVE-NEXT:    orr z0.d, z4.d, z0.d
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str d0, [x0]
-; SVE-NEXT:    add sp, sp, #16
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v4f16_v4f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    sub sp, sp, #16
-; SVE2-NEXT:    .cfi_def_cfa_offset 16
-; SVE2-NEXT:    ldp q2, q1, [x1]
-; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT:    ldr d5, [x0]
-; SVE2-NEXT:    mov z3.d, z1.d[1]
-; SVE2-NEXT:    mov z4.d, z2.d[1]
+; SVE2-NEXT:    ldp q0, q1, [x1]
+; SVE2-NEXT:    mov z2.d, z1.d[1]
+; SVE2-NEXT:    mov z3.d, z0.d[1]
 ; SVE2-NEXT:    fcvt h1, d1
+; SVE2-NEXT:    fcvt h0, d0
 ; SVE2-NEXT:    fcvt h2, d2
 ; SVE2-NEXT:    fcvt h3, d3
-; SVE2-NEXT:    fcvt h4, d4
-; SVE2-NEXT:    str h1, [sp, #12]
-; SVE2-NEXT:    str h2, [sp, #8]
-; SVE2-NEXT:    str h3, [sp, #14]
-; SVE2-NEXT:    str h4, [sp, #10]
-; SVE2-NEXT:    ldr d1, [sp, #8]
-; SVE2-NEXT:    bsl z5.d, z5.d, z1.d, z0.d
-; SVE2-NEXT:    str d5, [x0]
-; SVE2-NEXT:    add sp, sp, #16
+; SVE2-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2-NEXT:    mov z2.h, #32767 // =0x7fff
+; SVE2-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE2-NEXT:    ldr d1, [x0]
+; SVE2-NEXT:    bsl z1.d, z1.d, z0.d, z2.d
+; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index a206fbc5102953..11fee267660c03 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -443,9 +443,10 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzu x8, h0
 ; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
@@ -471,19 +472,20 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    fcvtzu x10, h2
-; CHECK-NEXT:    fcvtzu x11, h0
-; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z2.h, z0.h[2]
+; CHECK-NEXT:    mov z3.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x10, h0
+; CHECK-NEXT:    fcvtzu x8, h1
+; CHECK-NEXT:    fcvtzu x9, h2
+; CHECK-NEXT:    fcvtzu x11, h3
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x11
+; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
@@ -521,31 +523,35 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    fcvtzu x12, h0
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z3.h, z0.h[2]
+; CHECK-NEXT:    mov z4.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x10, h0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    fcvtzu x8, h1
-; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x10, h3
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzu x11, h1
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-64]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    fcvtzu x8, h2
 ; CHECK-NEXT:    fcvtzu x9, h3
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    fcvtzu x10, h0
-; CHECK-NEXT:    ldp q2, q3, [sp]
-; CHECK-NEXT:    stp x12, x8, [sp, #32]
-; CHECK-NEXT:    stp x10, x9, [sp, #48]
-; CHECK-NEXT:    ldp q1, q0, [sp, #32]
-; CHECK-NEXT:    stp q2, q3, [x1, #32]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    fcvtzu x11, h4
+; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    mov z6.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fcvtzu x14, h1
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    fcvtzu x12, h5
+; CHECK-NEXT:    fcvtzu x13, h6
+; CHECK-NEXT:    fcvtzu x15, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x12
+; CHECK-NEXT:    fmov d4, x13
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    fmov d3, x14
+; CHECK-NEXT:    zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT:    fmov d4, x15
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT:    stp q3, q1, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
@@ -598,57 +604,67 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    mov z5.d, z1.d
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z4.h, z1.h[1]
+; CHECK-NEXT:    mov z6.h, z1.h[3]
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    mov z7.h, z0.h[1]
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    mov z4.h, z2.h[1]
-; CHECK-NEXT:    fcvtzu x8, h2
-; CHECK-NEXT:    mov z5.h, z2.h[3]
-; CHECK-NEXT:    mov z2.h, z2.h[2]
-; CHECK-NEXT:    fcvtzu x12, h3
-; CHECK-NEXT:    fcvtzu x9, h4
-; CHECK-NEXT:    mov z4.h, z3.h[1]
-; CHECK-NEXT:    fcvtzu x10, h5
-; CHECK-NEXT:    mov z5.h, z3.h[3]
-; CHECK-NEXT:    fcvtzu x11, h2
-; CHECK-NEXT:    mov z2.h, z3.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-128]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    fcvtzu x8, h4
-; CHECK-NEXT:    fcvtzu x9, h5
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
 ; CHECK-NEXT:    fcvtzu x10, h2
-; CHECK-NEXT:    mov z3.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    fcvtzu x11, h1
+; CHECK-NEXT:    fcvtzu x11, h4
+; CHECK-NEXT:    fcvtzu x12, h6
 ; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    stp x12, x8, [sp, #64]
-; CHECK-NEXT:    fcvtzu x12, h3
-; CHECK-NEXT:    fcvtzu x8, h4
-; CHECK-NEXT:    stp x10, x9, [sp, #80]
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzu x10, h0
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x11, x12, [sp, #32]
-; CHECK-NEXT:    fcvtzu x11, h2
-; CHECK-NEXT:    fcvtzu x12, h3
-; CHECK-NEXT:    stp x9, x8, [sp, #48]
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    ldp q0, q1, [sp]
-; CHECK-NEXT:    ldp q3, q4, [sp, #64]
-; CHECK-NEXT:    stp x10, x11, [sp, #96]
-; CHECK-NEXT:    ldp q6, q7, [sp, #32]
-; CHECK-NEXT:    stp x8, x12, [sp, #112]
-; CHECK-NEXT:    ldp q5, q2, [sp, #96]
-; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    stp q6, q7, [x1]
-; CHECK-NEXT:    stp q3, q4, [x1, #96]
-; CHECK-NEXT:    stp q5, q2, [x1, #64]
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    fmov d16, x9
+; CHECK-NEXT:    mov z2.h, z3.h[3]
+; CHECK-NEXT:    mov z4.h, z5.h[3]
+; CHECK-NEXT:    fcvtzu x14, h3
+; CHECK-NEXT:    fcvtzu x13, h1
+; CHECK-NEXT:    fcvtzu x15, h5
+; CHECK-NEXT:    mov z1.h, z3.h[1]
+; CHECK-NEXT:    mov z6.h, z5.h[1]
+; CHECK-NEXT:    mov z5.h, z5.h[2]
+; CHECK-NEXT:    mov z3.h, z3.h[2]
+; CHECK-NEXT:    fcvtzu x9, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fcvtzu x10, h4
+; CHECK-NEXT:    fmov d4, x11
+; CHECK-NEXT:    fcvtzu x11, h7
+; CHECK-NEXT:    fmov d7, x12
+; CHECK-NEXT:    fcvtzu x12, h0
+; CHECK-NEXT:    fmov d0, x13
+; CHECK-NEXT:    fcvtzu x13, h1
+; CHECK-NEXT:    fmov d1, x14
+; CHECK-NEXT:    fcvtzu x14, h6
+; CHECK-NEXT:    fmov d6, x15
+; CHECK-NEXT:    fcvtzu x15, h5
+; CHECK-NEXT:    fmov d5, x9
+; CHECK-NEXT:    fcvtzu x9, h3
+; CHECK-NEXT:    zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT:    fmov d16, x8
+; CHECK-NEXT:    zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT:    fmov d3, x12
+; CHECK-NEXT:    fmov d7, x10
+; CHECK-NEXT:    stp q4, q0, [x1, #64]
+; CHECK-NEXT:    fmov d0, x14
+; CHECK-NEXT:    fmov d4, x9
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT:    fmov d16, x15
+; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    fmov d2, x13
+; CHECK-NEXT:    zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    stp q0, q7, [x1, #96]
+; CHECK-NEXT:    stp q1, q4, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
@@ -1216,26 +1232,18 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ; CHECK-LABEL: fcvtzu_v4f64_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    mov z2.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z1.s[1]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    mov z3.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
@@ -1270,40 +1278,29 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ; CHECK-LABEL: fcvtzu_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.s, z3.s[1]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    mov z4.s, z0.s[1]
+; CHECK-NEXT:    mov z5.s, z1.s[1]
+; CHECK-NEXT:    mov z6.s, z3.s[1]
+; CHECK-NEXT:    mov z7.s, z2.s[1]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
@@ -1360,73 +1357,50 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f64_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q5, q6, [x0, #96]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x0, #96]
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    ldp q0, q4, [x0, #32]
+; CHECK-NEXT:    ldp q2, q7, [x0, #64]
+; CHECK-NEXT:    ldp q1, q3, [x0]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.d
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    ldp q6, q7, [x0, #64]
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z16.s, z1.s[1]
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z0.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z3.s[1]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT:    mov z3.s, z5.s[1]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    mov z17.s, z6.s[1]
+; CHECK-NEXT:    mov z16.s, z4.s[1]
+; CHECK-NEXT:    mov z18.s, z5.s[1]
+; CHECK-NEXT:    mov z21.s, z0.s[1]
+; CHECK-NEXT:    mov z19.s, z7.s[1]
+; CHECK-NEXT:    mov z20.s, z2.s[1]
+; CHECK-NEXT:    mov z22.s, z3.s[1]
+; CHECK-NEXT:    mov z23.s, z1.s[1]
+; CHECK-NEXT:    zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT:    zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT:    zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT:    zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT:    zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    stp q0, q2, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
@@ -2187,9 +2161,10 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzs x8, h0
 ; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
@@ -2215,19 +2190,20 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    fcvtzs x10, h2
-; CHECK-NEXT:    fcvtzs x11, h0
-; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z2.h, z0.h[2]
+; CHECK-NEXT:    mov z3.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x10, h0
+; CHECK-NEXT:    fcvtzs x8, h1
+; CHECK-NEXT:    fcvtzs x9, h2
+; CHECK-NEXT:    fcvtzs x11, h3
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x11
+; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
@@ -2265,31 +2241,35 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    fcvtzs x12, h0
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z3.h, z0.h[2]
+; CHECK-NEXT:    mov z4.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x10, h0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    fcvtzs x8, h1
-; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x10, h3
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzs x11, h1
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-64]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    fcvtzs x8, h2
 ; CHECK-NEXT:    fcvtzs x9, h3
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    fcvtzs x10, h0
-; CHECK-NEXT:    ldp q2, q3, [sp]
-; CHECK-NEXT:    stp x12, x8, [sp, #32]
-; CHECK-NEXT:    stp x10, x9, [sp, #48]
-; CHECK-NEXT:    ldp q1, q0, [sp, #32]
-; CHECK-NEXT:    stp q2, q3, [x1, #32]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    fcvtzs x11, h4
+; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    mov z6.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fcvtzs x14, h1
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    fcvtzs x12, h5
+; CHECK-NEXT:    fcvtzs x13, h6
+; CHECK-NEXT:    fcvtzs x15, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x12
+; CHECK-NEXT:    fmov d4, x13
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    fmov d3, x14
+; CHECK-NEXT:    zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT:    fmov d4, x15
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT:    stp q3, q1, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
@@ -2342,57 +2322,67 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    mov z5.d, z1.d
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z4.h, z1.h[1]
+; CHECK-NEXT:    mov z6.h, z1.h[3]
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    mov z7.h, z0.h[1]
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    mov z4.h, z2.h[1]
-; CHECK-NEXT:    fcvtzs x8, h2
-; CHECK-NEXT:    mov z5.h, z2.h[3]
-; CHECK-NEXT:    mov z2.h, z2.h[2]
-; CHECK-NEXT:    fcvtzs x12, h3
-; CHECK-NEXT:    fcvtzs x9, h4
-; CHECK-NEXT:    mov z4.h, z3.h[1]
-; CHECK-NEXT:    fcvtzs x10, h5
-; CHECK-NEXT:    mov z5.h, z3.h[3]
-; CHECK-NEXT:    fcvtzs x11, h2
-; CHECK-NEXT:    mov z2.h, z3.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-128]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    fcvtzs x8, h4
-; CHECK-NEXT:    fcvtzs x9, h5
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
 ; CHECK-NEXT:    fcvtzs x10, h2
-; CHECK-NEXT:    mov z3.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    fcvtzs x11, h1
+; CHECK-NEXT:    fcvtzs x11, h4
+; CHECK-NEXT:    fcvtzs x12, h6
 ; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    stp x12, x8, [sp, #64]
-; CHECK-NEXT:    fcvtzs x12, h3
-; CHECK-NEXT:    fcvtzs x8, h4
-; CHECK-NEXT:    stp x10, x9, [sp, #80]
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzs x10, h0
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x11, x12, [sp, #32]
-; CHECK-NEXT:    fcvtzs x11, h2
-; CHECK-NEXT:    fcvtzs x12, h3
-; CHECK-NEXT:    stp x9, x8, [sp, #48]
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    ldp q0, q1, [sp]
-; CHECK-NEXT:    ldp q3, q4, [sp, #64]
-; CHECK-NEXT:    stp x10, x11, [sp, #96]
-; CHECK-NEXT:    ldp q6, q7, [sp, #32]
-; CHECK-NEXT:    stp x8, x12, [sp, #112]
-; CHECK-NEXT:    ldp q5, q2, [sp, #96]
-; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    stp q6, q7, [x1]
-; CHECK-NEXT:    stp q3, q4, [x1, #96]
-; CHECK-NEXT:    stp q5, q2, [x1, #64]
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    fmov d16, x9
+; CHECK-NEXT:    mov z2.h, z3.h[3]
+; CHECK-NEXT:    mov z4.h, z5.h[3]
+; CHECK-NEXT:    fcvtzs x14, h3
+; CHECK-NEXT:    fcvtzs x13, h1
+; CHECK-NEXT:    fcvtzs x15, h5
+; CHECK-NEXT:    mov z1.h, z3.h[1]
+; CHECK-NEXT:    mov z6.h, z5.h[1]
+; CHECK-NEXT:    mov z5.h, z5.h[2]
+; CHECK-NEXT:    mov z3.h, z3.h[2]
+; CHECK-NEXT:    fcvtzs x9, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fcvtzs x10, h4
+; CHECK-NEXT:    fmov d4, x11
+; CHECK-NEXT:    fcvtzs x11, h7
+; CHECK-NEXT:    fmov d7, x12
+; CHECK-NEXT:    fcvtzs x12, h0
+; CHECK-NEXT:    fmov d0, x13
+; CHECK-NEXT:    fcvtzs x13, h1
+; CHECK-NEXT:    fmov d1, x14
+; CHECK-NEXT:    fcvtzs x14, h6
+; CHECK-NEXT:    fmov d6, x15
+; CHECK-NEXT:    fcvtzs x15, h5
+; CHECK-NEXT:    fmov d5, x9
+; CHECK-NEXT:    fcvtzs x9, h3
+; CHECK-NEXT:    zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT:    fmov d16, x8
+; CHECK-NEXT:    zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT:    fmov d3, x12
+; CHECK-NEXT:    fmov d7, x10
+; CHECK-NEXT:    stp q4, q0, [x1, #64]
+; CHECK-NEXT:    fmov d0, x14
+; CHECK-NEXT:    fmov d4, x9
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT:    fmov d16, x15
+; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    fmov d2, x13
+; CHECK-NEXT:    zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    stp q0, q7, [x1, #96]
+; CHECK-NEXT:    stp q1, q4, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
@@ -2962,26 +2952,18 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ; CHECK-LABEL: fcvtzs_v4f64_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    mov z2.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z1.s[1]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    mov z3.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
@@ -3016,40 +2998,29 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ; CHECK-LABEL: fcvtzs_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.s, z3.s[1]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    mov z4.s, z0.s[1]
+; CHECK-NEXT:    mov z5.s, z1.s[1]
+; CHECK-NEXT:    mov z6.s, z3.s[1]
+; CHECK-NEXT:    mov z7.s, z2.s[1]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
@@ -3106,73 +3077,50 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f64_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q5, q6, [x0, #96]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x0, #96]
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    ldp q0, q4, [x0, #32]
+; CHECK-NEXT:    ldp q2, q7, [x0, #64]
+; CHECK-NEXT:    ldp q1, q3, [x0]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.d
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    ldp q6, q7, [x0, #64]
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z16.s, z1.s[1]
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z0.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z3.s[1]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT:    mov z3.s, z5.s[1]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    mov z17.s, z6.s[1]
+; CHECK-NEXT:    mov z16.s, z4.s[1]
+; CHECK-NEXT:    mov z18.s, z5.s[1]
+; CHECK-NEXT:    mov z21.s, z0.s[1]
+; CHECK-NEXT:    mov z19.s, z7.s[1]
+; CHECK-NEXT:    mov z20.s, z2.s[1]
+; CHECK-NEXT:    mov z22.s, z3.s[1]
+; CHECK-NEXT:    mov z23.s, z1.s[1]
+; CHECK-NEXT:    zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT:    zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT:    zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT:    zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT:    zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    stp q0, q2, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 035c76b569298a..ad5f91a5f39a49 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -8,25 +8,18 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask) {
 ; CHECK-LABEL: select_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    mov z3.s, z2.s[1]
-; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d2, [sp, #8]
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
 ; CHECK-NEXT:    lsl z2.h, z2.h, #15
 ; CHECK-NEXT:    asr z2.h, z2.h, #15
 ; CHECK-NEXT:    and z2.h, z2.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: select_v2f16:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index d77473ed8f08e5..275d13ebfd9491 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -506,14 +506,10 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
 define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; CHECK-LABEL: insertelement_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    fmov h1, #5.00000000
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2f16:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 0c712a15d4de2f..e595686cb4975d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1140,18 +1140,14 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-LABEL: ucvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov x9, d1
 ; CHECK-NEXT:    ucvtf h0, x8
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    ucvtf h1, x8
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ucvtf h1, x9
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
@@ -2598,18 +2594,14 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-LABEL: scvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov x9, d1
 ; CHECK-NEXT:    scvtf h0, x8
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    scvtf h1, x8
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    scvtf h1, x9
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 270f05a806b82d..613543310f2c31 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -10,25 +10,20 @@ declare void @def(ptr)
 define void @alloc_v4i8(ptr %st_ptr) nounwind {
 ; CHECK-LABEL: alloc_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    add x0, sp, #28
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    add x20, sp, #28
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x20, sp, #12
 ; CHECK-NEXT:    bl def
 ; CHECK-NEXT:    ptrue p0.b, vl2
 ; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x20]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    mov z2.b, z0.b[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x19]
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: alloc_v4i8:
@@ -62,32 +57,28 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
 define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ; CHECK-LABEL: alloc_v6i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x0, sp, #8
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    add x8, sp, #4
 ; CHECK-NEXT:    ptrue p1.s, vl2
 ; CHECK-NEXT:    mov z1.b, z0.b[3]
-; CHECK-NEXT:    mov z2.b, z0.b[5]
-; CHECK-NEXT:    mov z0.b, z0.b[1]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    add x8, sp, #20
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    st1b { z0.h }, p0, [x8]
-; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x8]
-; CHECK-NEXT:    strb w9, [x19, #2]
+; CHECK-NEXT:    mov z2.b, z0.b[1]
+; CHECK-NEXT:    mov z0.b, z0.b[5]
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z1.s, z1.s, z0.s
+; CHECK-NEXT:    st1b { z1.h }, p0, [x8]
+; CHECK-NEXT:    ld1h { z1.s }, p1/z, [x8]
 ; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x19, #2]
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    strh w8, [x19]
-; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: alloc_v6i8:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 5f4b9dd1592cf2..9055b2efba3282 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1466,23 +1466,18 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    fmov s1, wzr
+; CHECK-NEXT:    mov z2.s, z0.s[1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f16:
@@ -2318,33 +2313,21 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    mov z1.b, z0.b[3]
 ; CHECK-NEXT:    mov z2.b, z0.b[2]
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    mov z3.b, z0.b[1]
 ; CHECK-NEXT:    mov z4.b, z0.b[7]
-; CHECK-NEXT:    strh w8, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.b, z0.b[6]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[5]
-; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    ldp d0, d1, [sp]
+; CHECK-NEXT:    mov z5.b, z0.b[6]
+; CHECK-NEXT:    mov z6.b, z0.b[5]
+; CHECK-NEXT:    mov z7.b, z0.b[4]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z2.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z3.h, z7.h, z6.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    zip1 z1.s, z3.s, z2.s
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    lsl z0.s, z0.s, #31
@@ -2357,7 +2340,6 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8f32:
@@ -2684,23 +2666,21 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-LABEL: masked_load_zext_v3i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    strh w3, [sp, #12]
+; CHECK-NEXT:    fmov s0, w2
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    strh w2, [sp, #10]
-; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT:    strh w1, [sp, #8]
-; CHECK-NEXT:    ldr d1, [sp, #8]
-; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    fmov s1, w3
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
@@ -2759,23 +2739,21 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-LABEL: masked_load_sext_v3i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    strh w3, [sp, #12]
+; CHECK-NEXT:    fmov s0, w2
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    strh w2, [sp, #10]
-; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT:    strh w1, [sp, #8]
-; CHECK-NEXT:    ldr d1, [sp, #8]
-; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    fmov s1, w3
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 0c3411e5f55148..265480b571970f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -589,23 +589,18 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    fmov s1, wzr
+; CHECK-NEXT:    mov z2.s, z0.s[1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v2f16:
@@ -1014,48 +1009,33 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z4.b, z0.b[4]
+; CHECK-NEXT:    mov z5.b, z0.b[3]
+; CHECK-NEXT:    mov z6.b, z0.b[2]
+; CHECK-NEXT:    mov z7.b, z0.b[1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[3]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[2]
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    mov z4.b, z0.b[1]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z2.h, z4.h, z3.h
+; CHECK-NEXT:    zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z7.h
+; CHECK-NEXT:    zip1 z1.s, z2.s, z1.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z3.s
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
+; CHECK-NEXT:    lsl z0.s, z0.s, #31
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
+; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    mov z1.s, #0 // =0x0
-; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr d0, [sp]
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    lsl z0.s, z0.s, #31
-; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8f32:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index b91f813c5141bb..8b296d9fbc215d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -9,65 +9,44 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @zip1_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip1_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    mov z2.b, z0.b[15]
-; CHECK-NEXT:    mov z3.b, z0.b[14]
-; CHECK-NEXT:    mov z4.b, z0.b[13]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z3.b, z0.b[11]
-; CHECK-NEXT:    mov z2.b, z0.b[12]
-; CHECK-NEXT:    strb w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[10]
-; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[9]
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[8]
-; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    mov z4.b, z0.b[14]
+; CHECK-NEXT:    mov z6.b, z0.b[13]
+; CHECK-NEXT:    mov z3.b, z1.b[15]
+; CHECK-NEXT:    mov z5.b, z1.b[14]
+; CHECK-NEXT:    mov z7.b, z1.b[13]
+; CHECK-NEXT:    mov z16.b, z0.b[12]
+; CHECK-NEXT:    mov z17.b, z1.b[12]
+; CHECK-NEXT:    mov z18.b, z0.b[11]
+; CHECK-NEXT:    mov z19.b, z1.b[11]
+; CHECK-NEXT:    mov z20.b, z0.b[10]
+; CHECK-NEXT:    mov z21.b, z1.b[10]
+; CHECK-NEXT:    mov z22.b, z0.b[9]
+; CHECK-NEXT:    mov z23.b, z1.b[9]
+; CHECK-NEXT:    mov z24.b, z0.b[8]
+; CHECK-NEXT:    mov z25.b, z1.b[8]
+; CHECK-NEXT:    zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT:    zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT:    zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT:    zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT:    zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT:    zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT:    zip1 z16.b, z22.b, z23.b
 ; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT:    strb w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[15]
-; CHECK-NEXT:    strb w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[14]
-; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[13]
-; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[12]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[11]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[10]
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[9]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[8]
-; CHECK-NEXT:    strb w9, [sp, #5]
-; CHECK-NEXT:    strb w8, [sp, #7]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strb w8, [sp, #3]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT:    zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z4.h, z7.h, z6.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip1_v32i8:
@@ -159,123 +138,97 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    ldp q0, q4, [x0]
-; CHECK-NEXT:    ldp q2, q5, [x0, #32]
-; CHECK-NEXT:    mov z16.h, z3.h[7]
-; CHECK-NEXT:    mov z18.h, z3.h[6]
-; CHECK-NEXT:    mov z17.h, z4.h[7]
-; CHECK-NEXT:    ldp q6, q7, [x1, #32]
-; CHECK-NEXT:    mov z19.h, z4.h[6]
-; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    mov z5.h, z1.h[7]
+; CHECK-NEXT:    mov z7.h, z1.h[6]
+; CHECK-NEXT:    mov z17.h, z1.h[5]
+; CHECK-NEXT:    mov z4.h, z3.h[7]
+; CHECK-NEXT:    mov z6.h, z3.h[6]
 ; CHECK-NEXT:    mov z16.h, z3.h[5]
-; CHECK-NEXT:    fmov w9, s17
-; CHECK-NEXT:    mov z17.h, z4.h[5]
-; CHECK-NEXT:    mov z20.h, z7.h[6]
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    mov z20.h, z2.h[7]
+; CHECK-NEXT:    mov z21.h, z0.h[7]
 ; CHECK-NEXT:    mov z18.h, z3.h[4]
-; CHECK-NEXT:    strh w9, [sp, #28]
-; CHECK-NEXT:    fmov w9, s19
-; CHECK-NEXT:    mov z19.h, z5.h[7]
-; CHECK-NEXT:    zip1 z3.h, z4.h, z3.h
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z4.h[4]
-; CHECK-NEXT:    strh w9, [sp, #24]
-; CHECK-NEXT:    zip1 z4.h, z5.h, z7.h
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z17.h, z1.h[7]
-; CHECK-NEXT:    add z3.h, z3.h, z4.h
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z0.h[7]
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z1.h[6]
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z17.h, z0.h[6]
-; CHECK-NEXT:    strh w8, [sp, #62]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z1.h[5]
-; CHECK-NEXT:    strh w8, [sp, #60]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z0.h[5]
-; CHECK-NEXT:    strh w8, [sp, #58]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z17.h, z1.h[4]
-; CHECK-NEXT:    strh w8, [sp, #56]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z0.h[4]
-; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    zip1 z1.h, z2.h, z6.h
-; CHECK-NEXT:    strh w8, [sp, #54]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    ldr q16, [sp, #16]
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    strh w8, [sp, #52]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    strh w8, [sp, #50]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z7.h[7]
-; CHECK-NEXT:    strh w8, [sp, #48]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z5.h[6]
-; CHECK-NEXT:    ldr q17, [sp, #48]
-; CHECK-NEXT:    strh w8, [sp, #46]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z7.h[5]
-; CHECK-NEXT:    strh w8, [sp, #44]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z5.h[5]
-; CHECK-NEXT:    strh w8, [sp, #42]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z7.h[4]
-; CHECK-NEXT:    strh w8, [sp, #40]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z5.h[4]
-; CHECK-NEXT:    strh w8, [sp, #38]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z6.h[7]
-; CHECK-NEXT:    strh w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z2.h[7]
-; CHECK-NEXT:    strh w8, [sp, #34]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z6.h[6]
-; CHECK-NEXT:    strh w8, [sp, #32]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z2.h[6]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z6.h[5]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z2.h[5]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z6.h[4]
-; CHECK-NEXT:    fmov w9, s19
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z2.h[4]
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    ldr q2, [sp, #32]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    add z2.h, z16.h, z2.h
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    ldr q4, [sp]
-; CHECK-NEXT:    stp q3, q2, [x0, #32]
-; CHECK-NEXT:    add z1.h, z17.h, z4.h
-; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    mov z19.h, z1.h[4]
+; CHECK-NEXT:    mov z22.h, z2.h[6]
+; CHECK-NEXT:    mov z23.h, z0.h[6]
+; CHECK-NEXT:    zip1 z24.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z25.h, z7.h, z6.h
+; CHECK-NEXT:    zip1 z17.h, z17.h, z16.h
+; CHECK-NEXT:    ldp q4, q6, [x0, #32]
+; CHECK-NEXT:    zip1 z16.h, z21.h, z20.h
+; CHECK-NEXT:    ldp q5, q7, [x1, #32]
+; CHECK-NEXT:    zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z19.s, z25.s, z24.s
+; CHECK-NEXT:    zip1 z22.h, z23.h, z22.h
+; CHECK-NEXT:    mov z23.h, z2.h[5]
+; CHECK-NEXT:    mov z21.h, z6.h[7]
+; CHECK-NEXT:    mov z24.h, z0.h[5]
+; CHECK-NEXT:    mov z25.h, z2.h[4]
+; CHECK-NEXT:    mov z20.h, z7.h[7]
+; CHECK-NEXT:    mov z26.h, z0.h[4]
+; CHECK-NEXT:    mov z27.h, z6.h[6]
+; CHECK-NEXT:    mov z28.h, z7.h[5]
+; CHECK-NEXT:    mov z29.h, z6.h[5]
+; CHECK-NEXT:    mov z30.h, z7.h[4]
+; CHECK-NEXT:    mov z31.h, z6.h[4]
+; CHECK-NEXT:    mov z8.h, z5.h[7]
+; CHECK-NEXT:    mov z9.h, z4.h[7]
+; CHECK-NEXT:    zip1 z20.h, z21.h, z20.h
+; CHECK-NEXT:    mov z21.h, z7.h[6]
+; CHECK-NEXT:    mov z10.h, z5.h[6]
+; CHECK-NEXT:    mov z11.h, z4.h[6]
+; CHECK-NEXT:    mov z12.h, z5.h[5]
+; CHECK-NEXT:    mov z13.h, z4.h[5]
+; CHECK-NEXT:    mov z14.h, z5.h[4]
+; CHECK-NEXT:    mov z15.h, z4.h[4]
+; CHECK-NEXT:    zip1 z23.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z21.h, z27.h, z21.h
+; CHECK-NEXT:    zip1 z27.h, z29.h, z28.h
+; CHECK-NEXT:    zip1 z28.h, z31.h, z30.h
+; CHECK-NEXT:    zip1 z24.h, z26.h, z25.h
+; CHECK-NEXT:    zip1 z25.h, z9.h, z8.h
+; CHECK-NEXT:    zip1 z26.h, z11.h, z10.h
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z29.h, z13.h, z12.h
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z30.h, z15.h, z14.h
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z17.s, z18.s, z17.s
+; CHECK-NEXT:    zip1 z18.s, z21.s, z20.s
+; CHECK-NEXT:    zip1 z20.s, z28.s, z27.s
+; CHECK-NEXT:    zip1 z16.s, z22.s, z16.s
+; CHECK-NEXT:    zip1 z21.s, z24.s, z23.s
+; CHECK-NEXT:    zip1 z1.h, z1.h, z3.h
+; CHECK-NEXT:    zip1 z3.s, z26.s, z25.s
+; CHECK-NEXT:    zip1 z22.s, z30.s, z29.s
+; CHECK-NEXT:    zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z7.d, z17.d, z19.d
+; CHECK-NEXT:    zip1 z17.d, z20.d, z18.d
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z2.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z4.d, z21.d, z16.d
+; CHECK-NEXT:    zip1 z3.d, z22.d, z3.d
+; CHECK-NEXT:    add z1.h, z1.h, z6.h
+; CHECK-NEXT:    add z5.h, z7.h, z17.h
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
+; CHECK-NEXT:    add z2.h, z4.h, z3.h
+; CHECK-NEXT:    stp q1, q5, [x0, #32]
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip_v32i16:
@@ -436,41 +389,28 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 define void @zip1_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip1_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    mov z3.h, z0.h[6]
-; CHECK-NEXT:    mov z4.h, z0.h[5]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[4]
-; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov z4.h, z0.h[6]
+; CHECK-NEXT:    mov z6.h, z0.h[5]
 ; CHECK-NEXT:    mov z3.h, z1.h[7]
+; CHECK-NEXT:    mov z5.h, z1.h[6]
+; CHECK-NEXT:    mov z7.h, z1.h[5]
+; CHECK-NEXT:    mov z16.h, z0.h[4]
+; CHECK-NEXT:    mov z17.h, z1.h[4]
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[6]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z1.h[5]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT:    zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z5.h, z16.h, z17.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip1_v16i16:
@@ -530,8 +470,6 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip1_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
@@ -539,18 +477,13 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z4.s, z0.s[2]
 ; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w8, w9, [sp]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z3.s, z4.s, z5.s
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip1_v8i32:
@@ -636,25 +569,18 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 define void @zip_v4i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z3.s, z1.s[3]
 ; CHECK-NEXT:    mov z4.s, z0.s[2]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    stp w9, w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w9, w8, [sp]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip_v4i32:
@@ -1209,65 +1135,44 @@ define void @trn_v8i32_undef(ptr %a) {
 define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: zip2_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    mov z2.b, z0.b[15]
-; CHECK-NEXT:    mov z3.b, z0.b[14]
-; CHECK-NEXT:    mov z4.b, z0.b[13]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z3.b, z0.b[11]
-; CHECK-NEXT:    mov z2.b, z0.b[12]
-; CHECK-NEXT:    strb w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[10]
-; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[9]
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[8]
-; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    mov z4.b, z0.b[14]
+; CHECK-NEXT:    mov z6.b, z0.b[13]
+; CHECK-NEXT:    mov z3.b, z1.b[15]
+; CHECK-NEXT:    mov z5.b, z1.b[14]
+; CHECK-NEXT:    mov z7.b, z1.b[13]
+; CHECK-NEXT:    mov z16.b, z0.b[12]
+; CHECK-NEXT:    mov z17.b, z1.b[12]
+; CHECK-NEXT:    mov z18.b, z0.b[11]
+; CHECK-NEXT:    mov z19.b, z1.b[11]
+; CHECK-NEXT:    mov z20.b, z0.b[10]
+; CHECK-NEXT:    mov z21.b, z1.b[10]
+; CHECK-NEXT:    mov z22.b, z0.b[9]
+; CHECK-NEXT:    mov z23.b, z1.b[9]
+; CHECK-NEXT:    mov z24.b, z0.b[8]
+; CHECK-NEXT:    mov z25.b, z1.b[8]
+; CHECK-NEXT:    zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT:    zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT:    zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT:    zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT:    zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT:    zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT:    zip1 z16.b, z22.b, z23.b
 ; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT:    strb w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[15]
-; CHECK-NEXT:    strb w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[14]
-; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[13]
-; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[12]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[11]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[10]
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[9]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[8]
-; CHECK-NEXT:    strb w9, [sp, #5]
-; CHECK-NEXT:    strb w8, [sp, #7]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strb w8, [sp, #3]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT:    zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z4.h, z7.h, z6.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip2_v32i8:
@@ -1359,41 +1264,28 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: zip2_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    mov z3.h, z0.h[6]
-; CHECK-NEXT:    mov z4.h, z0.h[5]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[4]
-; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov z4.h, z0.h[6]
+; CHECK-NEXT:    mov z6.h, z0.h[5]
 ; CHECK-NEXT:    mov z3.h, z1.h[7]
+; CHECK-NEXT:    mov z5.h, z1.h[6]
+; CHECK-NEXT:    mov z7.h, z1.h[5]
+; CHECK-NEXT:    mov z16.h, z0.h[4]
+; CHECK-NEXT:    mov z17.h, z1.h[4]
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[6]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z1.h[5]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT:    zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z5.h, z16.h, z17.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip2_v16i16:
@@ -1453,8 +1345,6 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: zip2_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
@@ -1462,18 +1352,13 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z4.s, z0.s[2]
 ; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w8, w9, [sp]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z3.s, z4.s, z5.s
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip2_v8i32:
@@ -1547,197 +1432,139 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    mov z4.b, z3.b[14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z6.b, z3.b[10]
-; CHECK-NEXT:    mov z5.b, z3.b[12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z7.b, z3.b[8]
-; CHECK-NEXT:    mov z17.b, z3.b[9]
-; CHECK-NEXT:    mov z18.b, z3.b[7]
-; CHECK-NEXT:    mov z16.b, z3.b[11]
-; CHECK-NEXT:    strb w8, [sp, #40]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z3.b[6]
-; CHECK-NEXT:    strb w9, [sp, #32]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z5.b, z3.b[4]
-; CHECK-NEXT:    strb w8, [sp, #47]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z3.b[2]
-; CHECK-NEXT:    strb w9, [sp, #46]
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    mov z7.b, z2.b[14]
-; CHECK-NEXT:    strb w8, [sp, #45]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z2.b[12]
-; CHECK-NEXT:    strb w9, [sp, #44]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    mov z16.b, z2.b[11]
-; CHECK-NEXT:    strb w8, [sp, #43]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z2.b[10]
-; CHECK-NEXT:    strb w9, [sp, #61]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    strb w8, [sp, #42]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z2.b[8]
-; CHECK-NEXT:    strb w9, [sp, #53]
-; CHECK-NEXT:    strb w8, [sp, #41]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z2.b[6]
-; CHECK-NEXT:    strb w8, [sp, #39]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z2.b[4]
-; CHECK-NEXT:    strb w8, [sp, #38]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z2.b[2]
-; CHECK-NEXT:    strb w8, [sp, #37]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[10]
-; CHECK-NEXT:    strb w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z1.b[8]
-; CHECK-NEXT:    strb w8, [sp, #35]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[14]
-; CHECK-NEXT:    strb w8, [sp, #34]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z1.b[12]
-; CHECK-NEXT:    strb w8, [sp, #33]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[6]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z1.b[4]
-; CHECK-NEXT:    strb w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[2]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z0.b[14]
-; CHECK-NEXT:    strb w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[12]
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z0.b[10]
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z0.b[8]
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z0.b[6]
-; CHECK-NEXT:    strb w8, [sp, #7]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[4]
-; CHECK-NEXT:    strb w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z0.b[2]
-; CHECK-NEXT:    strb w8, [sp, #5]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z3.b[15]
-; CHECK-NEXT:    strb w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z3.b[13]
-; CHECK-NEXT:    strb w8, [sp, #3]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    ldr q4, [sp, #32]
-; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z3.b[5]
-; CHECK-NEXT:    mov z3.b, z3.b[3]
-; CHECK-NEXT:    ldr q5, [sp]
-; CHECK-NEXT:    strb w8, [sp, #63]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z2.b[13]
-; CHECK-NEXT:    strb w8, [sp, #62]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    strb w8, [sp, #60]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    strb w8, [sp, #59]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z2.b[9]
-; CHECK-NEXT:    strb w8, [sp, #58]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z2.b[5]
-; CHECK-NEXT:    strb w8, [sp, #57]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z2.b[3]
+; CHECK-NEXT:    stp d13, d12, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z2.b, z1.b[14]
+; CHECK-NEXT:    mov z3.b, z1.b[12]
+; CHECK-NEXT:    mov z4.b, z1.b[10]
+; CHECK-NEXT:    mov z5.b, z1.b[8]
+; CHECK-NEXT:    mov z6.b, z1.b[6]
+; CHECK-NEXT:    mov z7.b, z1.b[4]
+; CHECK-NEXT:    mov z16.b, z1.b[2]
+; CHECK-NEXT:    mov z18.b, z0.b[14]
+; CHECK-NEXT:    mov z19.b, z0.b[12]
+; CHECK-NEXT:    zip1 z3.b, z3.b, z2.b
+; CHECK-NEXT:    ldp q2, q17, [x1]
+; CHECK-NEXT:    mov z20.b, z0.b[10]
+; CHECK-NEXT:    zip1 z4.b, z5.b, z4.b
+; CHECK-NEXT:    zip1 z5.b, z7.b, z6.b
+; CHECK-NEXT:    zip1 z6.b, z1.b, z16.b
+; CHECK-NEXT:    mov z7.b, z0.b[8]
+; CHECK-NEXT:    mov z16.b, z0.b[6]
+; CHECK-NEXT:    mov z21.b, z0.b[4]
+; CHECK-NEXT:    mov z22.b, z0.b[2]
+; CHECK-NEXT:    mov z23.b, z17.b[14]
+; CHECK-NEXT:    mov z24.b, z17.b[12]
+; CHECK-NEXT:    mov z25.b, z17.b[10]
+; CHECK-NEXT:    mov z26.b, z17.b[8]
+; CHECK-NEXT:    mov z27.b, z17.b[6]
+; CHECK-NEXT:    mov z28.b, z17.b[4]
+; CHECK-NEXT:    mov z29.b, z17.b[2]
+; CHECK-NEXT:    zip1 z18.b, z19.b, z18.b
+; CHECK-NEXT:    zip1 z7.b, z7.b, z20.b
+; CHECK-NEXT:    zip1 z16.b, z21.b, z16.b
+; CHECK-NEXT:    zip1 z19.b, z0.b, z22.b
+; CHECK-NEXT:    zip1 z20.b, z24.b, z23.b
+; CHECK-NEXT:    zip1 z21.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z22.b, z28.b, z27.b
+; CHECK-NEXT:    mov z24.b, z2.b[14]
+; CHECK-NEXT:    mov z25.b, z2.b[12]
+; CHECK-NEXT:    mov z26.b, z2.b[10]
+; CHECK-NEXT:    mov z27.b, z2.b[8]
+; CHECK-NEXT:    zip1 z23.b, z17.b, z29.b
+; CHECK-NEXT:    zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z5.h, z7.h, z18.h
+; CHECK-NEXT:    zip1 z6.h, z19.h, z16.h
+; CHECK-NEXT:    zip1 z7.h, z21.h, z20.h
+; CHECK-NEXT:    zip1 z18.b, z25.b, z24.b
+; CHECK-NEXT:    zip1 z19.b, z27.b, z26.b
+; CHECK-NEXT:    mov z20.b, z2.b[6]
+; CHECK-NEXT:    mov z21.b, z2.b[4]
+; CHECK-NEXT:    mov z29.b, z17.b[3]
+; CHECK-NEXT:    mov z30.b, z17.b[1]
+; CHECK-NEXT:    mov z31.b, z2.b[15]
+; CHECK-NEXT:    mov z8.b, z2.b[13]
+; CHECK-NEXT:    zip1 z16.h, z23.h, z22.h
+; CHECK-NEXT:    mov z22.b, z2.b[2]
+; CHECK-NEXT:    mov z23.b, z17.b[15]
+; CHECK-NEXT:    mov z24.b, z17.b[13]
+; CHECK-NEXT:    mov z25.b, z17.b[11]
+; CHECK-NEXT:    mov z26.b, z17.b[9]
+; CHECK-NEXT:    mov z27.b, z17.b[7]
+; CHECK-NEXT:    mov z28.b, z17.b[5]
+; CHECK-NEXT:    zip1 z17.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z21.b, z21.b, z20.b
+; CHECK-NEXT:    zip1 z19.b, z30.b, z29.b
+; CHECK-NEXT:    zip1 z20.b, z8.b, z31.b
+; CHECK-NEXT:    mov z29.b, z1.b[15]
+; CHECK-NEXT:    mov z30.b, z1.b[13]
+; CHECK-NEXT:    mov z31.b, z1.b[11]
+; CHECK-NEXT:    mov z8.b, z1.b[9]
+; CHECK-NEXT:    zip1 z22.b, z2.b, z22.b
+; CHECK-NEXT:    zip1 z23.b, z24.b, z23.b
+; CHECK-NEXT:    zip1 z24.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z18.b, z28.b, z27.b
+; CHECK-NEXT:    mov z25.b, z2.b[11]
+; CHECK-NEXT:    mov z26.b, z2.b[9]
+; CHECK-NEXT:    mov z27.b, z2.b[7]
+; CHECK-NEXT:    mov z28.b, z2.b[5]
+; CHECK-NEXT:    mov z9.b, z1.b[7]
+; CHECK-NEXT:    mov z10.b, z1.b[5]
+; CHECK-NEXT:    mov z1.b, z1.b[3]
+; CHECK-NEXT:    mov z11.b, z0.b[11]
+; CHECK-NEXT:    mov z12.b, z0.b[9]
+; CHECK-NEXT:    zip1 z29.b, z30.b, z29.b
+; CHECK-NEXT:    mov z30.b, z0.b[3]
+; CHECK-NEXT:    mov z13.b, z0.b[1]
+; CHECK-NEXT:    zip1 z31.b, z8.b, z31.b
+; CHECK-NEXT:    mov z8.b, z2.b[3]
 ; CHECK-NEXT:    mov z2.b, z2.b[1]
-; CHECK-NEXT:    strb w8, [sp, #54]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[15]
-; CHECK-NEXT:    strb w8, [sp, #52]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[13]
-; CHECK-NEXT:    strb w8, [sp, #50]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z1.b[11]
-; CHECK-NEXT:    strb w8, [sp, #49]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[9]
-; CHECK-NEXT:    strb w8, [sp, #48]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[7]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[15]
-; CHECK-NEXT:    strb w8, [sp, #31]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[5]
-; CHECK-NEXT:    strb w9, [sp, #28]
-; CHECK-NEXT:    strb w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z1.b[3]
-; CHECK-NEXT:    mov z1.b, z1.b[1]
-; CHECK-NEXT:    strb w8, [sp, #29]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z0.b[11]
-; CHECK-NEXT:    strb w8, [sp, #27]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[13]
-; CHECK-NEXT:    strb w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    strb w8, [sp, #25]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.b, z0.b[9]
-; CHECK-NEXT:    strb w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[7]
-; CHECK-NEXT:    strb w8, [sp, #23]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[5]
-; CHECK-NEXT:    strb w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z0.b[3]
-; CHECK-NEXT:    mov z0.b, z0.b[1]
-; CHECK-NEXT:    strb w8, [sp, #21]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #19]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strb w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strb w8, [sp, #17]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ldr q0, [sp, #48]
-; CHECK-NEXT:    add z0.b, z4.b, z0.b
-; CHECK-NEXT:    strb w8, [sp, #16]
-; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    add z1.b, z5.b, z1.b
+; CHECK-NEXT:    zip1 z9.b, z10.b, z9.b
+; CHECK-NEXT:    zip1 z10.b, z12.b, z11.b
+; CHECK-NEXT:    zip1 z1.b, z0.b, z1.b
+; CHECK-NEXT:    zip1 z30.b, z13.b, z30.b
+; CHECK-NEXT:    mov z11.b, z0.b[13]
+; CHECK-NEXT:    mov z0.b, z0.b[5]
+; CHECK-NEXT:    zip1 z25.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z26.b, z28.b, z27.b
+; CHECK-NEXT:    zip1 z2.b, z2.b, z8.b
+; CHECK-NEXT:    zip1 z21.h, z22.h, z21.h
+; CHECK-NEXT:    zip1 z22.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z23.h, z31.h, z29.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z9.h
+; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z24.h, z10.h, z11.h
+; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z0.h, z30.h, z0.h
+; CHECK-NEXT:    zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z19.h, z25.h, z20.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z26.h
+; CHECK-NEXT:    zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT:    zip1 z4.s, z6.s, z5.s
+; CHECK-NEXT:    zip1 z5.s, z16.s, z7.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z23.s
+; CHECK-NEXT:    zip1 z6.s, z21.s, z17.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z24.s
+; CHECK-NEXT:    zip1 z7.s, z18.s, z22.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z19.s
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    zip1 z1.d, z6.d, z5.d
+; CHECK-NEXT:    zip1 z2.d, z2.d, z7.d
+; CHECK-NEXT:    add z0.b, z3.b, z0.b
+; CHECK-NEXT:    add z1.b, z1.b, z2.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ldp d13, d12, [sp], #48 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v32i8:
@@ -1922,110 +1749,71 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    mov z4.h, z3.h[6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z6.h, z3.h[2]
-; CHECK-NEXT:    mov z5.h, z3.h[4]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z7.h, z2.h[6]
-; CHECK-NEXT:    mov z17.h, z2.h[7]
-; CHECK-NEXT:    mov z16.h, z3.h[1]
-; CHECK-NEXT:    strh w8, [sp, #40]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z2.h[4]
-; CHECK-NEXT:    strh w9, [sp, #32]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z5.h, z2.h[2]
-; CHECK-NEXT:    strh w8, [sp, #46]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z1.h[2]
-; CHECK-NEXT:    strh w9, [sp, #44]
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    mov z7.h, z0.h[6]
-; CHECK-NEXT:    strh w8, [sp, #42]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[6]
-; CHECK-NEXT:    strh w9, [sp, #38]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    strh w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp, #56]
-; CHECK-NEXT:    strh w8, [sp, #34]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z0.h[4]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z0.h[2]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z3.h[7]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.h, z3.h[5]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z3.h[3]
-; CHECK-NEXT:    ldr q3, [sp, #32]
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z2.h[5]
-; CHECK-NEXT:    ldr q4, [sp]
-; CHECK-NEXT:    strh w8, [sp, #62]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.h, z1.h[7]
-; CHECK-NEXT:    strh w8, [sp, #60]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z2.h[3]
-; CHECK-NEXT:    mov z2.h, z2.h[1]
-; CHECK-NEXT:    strh w8, [sp, #58]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    strh w8, [sp, #54]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z1.h[5]
-; CHECK-NEXT:    strh w9, [sp, #48]
-; CHECK-NEXT:    strh w8, [sp, #52]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    str d8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset b8, -16
+; CHECK-NEXT:    ldp q1, q6, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x1]
+; CHECK-NEXT:    mov z3.h, z6.h[6]
+; CHECK-NEXT:    mov z4.h, z6.h[4]
+; CHECK-NEXT:    mov z5.h, z6.h[2]
+; CHECK-NEXT:    mov z7.h, z1.h[6]
+; CHECK-NEXT:    mov z16.h, z1.h[4]
+; CHECK-NEXT:    mov z17.h, z1.h[2]
+; CHECK-NEXT:    mov z18.h, z2.h[6]
+; CHECK-NEXT:    mov z19.h, z2.h[4]
+; CHECK-NEXT:    mov z20.h, z2.h[2]
+; CHECK-NEXT:    mov z21.h, z0.h[6]
+; CHECK-NEXT:    mov z22.h, z0.h[4]
+; CHECK-NEXT:    zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z5.h, z16.h, z7.h
+; CHECK-NEXT:    zip1 z7.h, z1.h, z17.h
+; CHECK-NEXT:    zip1 z16.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z18.h, z2.h, z20.h
+; CHECK-NEXT:    mov z19.h, z0.h[2]
+; CHECK-NEXT:    zip1 z17.h, z22.h, z21.h
+; CHECK-NEXT:    mov z20.h, z6.h[7]
+; CHECK-NEXT:    mov z21.h, z6.h[5]
+; CHECK-NEXT:    mov z22.h, z6.h[3]
+; CHECK-NEXT:    mov z6.h, z6.h[1]
+; CHECK-NEXT:    mov z23.h, z1.h[7]
+; CHECK-NEXT:    mov z24.h, z1.h[5]
+; CHECK-NEXT:    mov z25.h, z1.h[3]
 ; CHECK-NEXT:    mov z1.h, z1.h[1]
-; CHECK-NEXT:    strh w8, [sp, #50]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z0.h[5]
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ldr q0, [sp, #48]
-; CHECK-NEXT:    add z0.h, z3.h, z0.h
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    add z1.h, z4.h, z1.h
-; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    mov z26.h, z2.h[7]
+; CHECK-NEXT:    mov z27.h, z2.h[5]
+; CHECK-NEXT:    mov z28.h, z2.h[3]
+; CHECK-NEXT:    mov z2.h, z2.h[1]
+; CHECK-NEXT:    mov z29.h, z0.h[7]
+; CHECK-NEXT:    mov z30.h, z0.h[5]
+; CHECK-NEXT:    mov z31.h, z0.h[3]
+; CHECK-NEXT:    mov z8.h, z0.h[1]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z19.h
+; CHECK-NEXT:    zip1 z19.h, z21.h, z20.h
+; CHECK-NEXT:    zip1 z6.h, z6.h, z22.h
+; CHECK-NEXT:    zip1 z20.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z25.h
+; CHECK-NEXT:    zip1 z21.h, z27.h, z26.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT:    zip1 z22.h, z30.h, z29.h
+; CHECK-NEXT:    zip1 z23.h, z8.h, z31.h
+; CHECK-NEXT:    zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT:    zip1 z4.s, z7.s, z5.s
+; CHECK-NEXT:    zip1 z5.s, z18.s, z16.s
+; CHECK-NEXT:    zip1 z6.s, z6.s, z19.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z20.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z17.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z21.s
+; CHECK-NEXT:    zip1 z7.s, z23.s, z22.s
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z5.d
+; CHECK-NEXT:    zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT:    add z1.h, z3.h, z1.h
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v16i16:
@@ -2116,32 +1904,28 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldp q6, q0, [x0]
 ; CHECK-NEXT:    adrp x8, .LCPI21_0
-; CHECK-NEXT:    ldp q4, q1, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    mov z3.s, z0.s[2]
-; CHECK-NEXT:    mov z5.s, z1.s[2]
-; CHECK-NEXT:    stp s0, s3, [sp, #24]
-; CHECK-NEXT:    mov z3.s, z4.s[2]
-; CHECK-NEXT:    stp s5, s2, [sp, #12]
-; CHECK-NEXT:    mov z5.s, z0.s[3]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    stp s3, s1, [sp, #4]
-; CHECK-NEXT:    mov z1.s, z2.s[1]
-; CHECK-NEXT:    str s5, [sp, #44]
+; CHECK-NEXT:    mov z4.s, z0.s[3]
+; CHECK-NEXT:    mov z5.s, z0.s[1]
+; CHECK-NEXT:    mov z7.s, z2.s[2]
+; CHECK-NEXT:    mov z16.s, z1.s[2]
+; CHECK-NEXT:    zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    mov z4.s, z6.s[1]
+; CHECK-NEXT:    zip1 z2.s, z2.s, z7.s
 ; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT:    str s0, [sp, #40]
-; CHECK-NEXT:    ldp q3, q2, [sp]
-; CHECK-NEXT:    tbl z0.s, { z4.s }, z5.s
-; CHECK-NEXT:    str s1, [sp, #32]
-; CHECK-NEXT:    ldr q1, [sp, #32]
-; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    zip1 z7.s, z0.s, z16.s
+; CHECK-NEXT:    tbl z1.s, { z1.s }, z5.s
+; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    zip1 z2.d, z7.d, z2.d
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT:    stp q1, q0, [x0]
-; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8f32:
@@ -2231,60 +2015,38 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z2.h, z1.h[6]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z4.h, z1.h[2]
-; CHECK-NEXT:    mov z6.h, z0.h[4]
-; CHECK-NEXT:    mov z3.h, z1.h[4]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z5.h, z0.h[6]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z3.h, z1.h[7]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[5]
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z5.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[1]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strh w9, [sp, #6]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    strh w9, [sp, #24]
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z0.h[5]
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    ldp q3, q0, [sp]
-; CHECK-NEXT:    add z0.h, z3.h, z0.h
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    mov z2.h, z0.h[6]
+; CHECK-NEXT:    mov z3.h, z0.h[4]
+; CHECK-NEXT:    mov z4.h, z0.h[2]
+; CHECK-NEXT:    mov z5.h, z1.h[6]
+; CHECK-NEXT:    mov z6.h, z1.h[4]
+; CHECK-NEXT:    mov z7.h, z1.h[2]
+; CHECK-NEXT:    mov z16.h, z0.h[7]
+; CHECK-NEXT:    mov z17.h, z0.h[5]
+; CHECK-NEXT:    mov z18.h, z0.h[3]
+; CHECK-NEXT:    mov z19.h, z0.h[1]
+; CHECK-NEXT:    mov z20.h, z1.h[7]
+; CHECK-NEXT:    mov z21.h, z1.h[5]
+; CHECK-NEXT:    mov z22.h, z1.h[3]
+; CHECK-NEXT:    mov z23.h, z1.h[1]
+; CHECK-NEXT:    zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z7.h
+; CHECK-NEXT:    zip1 z4.h, z17.h, z16.h
+; CHECK-NEXT:    zip1 z5.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z6.h, z21.h, z20.h
+; CHECK-NEXT:    zip1 z7.h, z23.h, z22.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8i16:
@@ -2341,31 +2103,21 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 define void @uzp_v8i32_undef(ptr %a) #0{
 ; CHECK-LABEL: uzp_v8i32_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.s, z0.s[2]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z3.s, z1.s[2]
-; CHECK-NEXT:    mov z4.s, z0.s[3]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.s, z1.s[3]
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    stp w8, w9, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    stp w9, w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    stp w9, w8, [sp, #16]
-; CHECK-NEXT:    ldp q0, q1, [sp]
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z2.s, z1.s[2]
+; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    mov z4.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[1]
+; CHECK-NEXT:    mov z6.s, z0.s[3]
+; CHECK-NEXT:    mov z7.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT:    zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    stp q0, q0, [x0]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8i32_undef:

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
index 88c83a214c7394..c942f1eca8ebaf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
@@ -10,22 +10,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.s, z0.s[3]
-; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z2.s, z0.s[2]
 ; CHECK-NEXT:    mov z3.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %el0 = extractelement <vscale x 4 x i1> %a, i32 0
   %el1 = extractelement <vscale x 4 x i1> %a, i32 1


        


More information about the llvm-commits mailing list