[llvm] [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (PR #111698)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 10 06:48:20 PDT 2024


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/111698

From b38040c060e1346c1c7228c0b796e20d9ca3b2d9 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 9 Oct 2024 13:56:49 +0000
Subject: [PATCH 1/4] [AArch64][SVE] Support lowering fixed-length
 BUILD_VECTORS to ZIPs

This allows lowering fixed-length (non-constant) BUILD_VECTORS
(<= 128-bit) to a chain of ZIP1 instructions when Neon is not available,
rather than using the default lowering, which is to spill to the stack
and reload.

For example,

```
t5: v4f32 = BUILD_VECTOR(t0, t1, t2, t3)
```

Becomes:

```
zip1 z0.s, z0.s, z1.s // z0 = t0,t1,...
zip1 z2.s, z2.s, z3.s // z2 = t2,t3,...
zip1 z0.d, z0.d, z2.d // z0 = t0,t1,t2,t3,...
```

When values are already in FPRs, this generally seems to lead to more
compact output with less movement to/from the stack.
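
For reference, a minimal standalone C++ sketch of the pairwise reduction the
lowering performs (the names `zip1`, `buildVectorViaZips`, and `Value` are
illustrative stand-ins, not the actual DAG helpers in the patch): each pass
zips adjacent intermediates, then repeats with the element width doubled,
until a single vector remains.

```
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for an SDValue; just records how the value was built.
struct Value { std::string Repr; };

// Placeholder for AArch64ISD::ZIP1 at a given element width (in bits).
static Value zip1(const Value &Lo, const Value &Hi, unsigned EltBits) {
  return {"zip1." + std::to_string(EltBits) + "(" + Lo.Repr + ", " + Hi.Repr +
          ")"};
}

// Combine a power-of-two number of scalars into one vector: zip adjacent
// pairs, then repeat with the element width doubled each level.
static Value buildVectorViaZips(std::vector<Value> Intermediates,
                                unsigned EltBits) {
  assert(!Intermediates.empty() &&
         (Intermediates.size() & (Intermediates.size() - 1)) == 0);
  while (Intermediates.size() > 1) {
    for (size_t I = 0; I < Intermediates.size(); I += 2)
      Intermediates[I / 2] =
          zip1(Intermediates[I], Intermediates[I + 1], EltBits);
    Intermediates.resize(Intermediates.size() / 2);
    EltBits *= 2; // Each ZIP level interleaves pairs, so elements double.
  }
  return Intermediates.front();
}

int main() {
  Value V = buildVectorViaZips({{"t0"}, {"t1"}, {"t2"}, {"t3"}}, 32);
  // Prints: zip1.64(zip1.32(t0, t1), zip1.32(t2, t3))
  std::printf("%s\n", V.Repr.c_str());
}
```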
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   65 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    1 +
 .../sve-fixed-length-vector-shuffle-tbl.ll    |  265 ++--
 ...streaming-mode-fixed-length-and-combine.ll |    8 +-
 ...sve-streaming-mode-fixed-length-bitcast.ll |   11 +-
 ...treaming-mode-fixed-length-build-vector.ll |  255 ++++
 .../sve-streaming-mode-fixed-length-concat.ll |   70 +-
 ...e-streaming-mode-fixed-length-ext-loads.ll |   44 +-
 ...ing-mode-fixed-length-extract-subvector.ll |   32 +-
 ...e-streaming-mode-fixed-length-fcopysign.ll |   52 +-
 ...e-streaming-mode-fixed-length-fp-to-int.ll |  692 +++++-----
 ...-streaming-mode-fixed-length-fp-vselect.ll |   13 +-
 ...ing-mode-fixed-length-insert-vector-elt.ll |   10 +-
 ...e-streaming-mode-fixed-length-int-to-fp.ll |   32 +-
 ...-streaming-mode-fixed-length-ld2-alloca.ll |   54 +-
 ...streaming-mode-fixed-length-masked-load.ll |  218 ++--
 ...treaming-mode-fixed-length-masked-store.ll |  198 +--
 ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 1140 +++++++----------
 ...e-streaming-mode-fixed-length-reshuffle.ll |   16 +-
 19 files changed, 1518 insertions(+), 1658 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 288fd3639e5eb7..6c1c33da4be996 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -2102,7 +2103,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
   setOperationAction(ISD::BITREVERSE, VT, Default);
   setOperationAction(ISD::BSWAP, VT, Default);
-  setOperationAction(ISD::BUILD_VECTOR, VT, Default);
+  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
   setOperationAction(ISD::CTLZ, VT, Default);
   setOperationAction(ISD::CTPOP, VT, Default);
@@ -14384,24 +14385,62 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
-SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
-                                                 SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+  auto *BVN = cast<BuildVectorSDNode>(Op);
 
-  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
-    if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
-      SDLoc DL(Op);
-      EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-      SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
-      SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
-      SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
-      return convertFromScalableVector(DAG, Op.getValueType(), Seq);
-    }
+  if (auto SeqInfo = BVN->isConstantSequence()) {
+    SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
+    SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
+    SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
+    return convertFromScalableVector(DAG, VT, Seq);
+  }
 
-    // Revert to common legalisation for all other variants.
+  if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
+      VT.getVectorNumElements() <= 1 || BVN->isConstant())
     return SDValue();
+
+  // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
+  EVT ZipVT = ContainerVT;
+  SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
+  SmallVector<SDValue, 16> Intermediates =
+      llvm::map_to_vector<16>(Op->op_values(), [&](SDValue Op) {
+        return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
+                           DAG.getUNDEF(ZipVT), Op, ZeroI64);
+      });
+
+  while (Intermediates.size() > 1) {
+    auto ToZipVT = [&](SDValue Op) { return DAG.getBitcast(ZipVT, Op); };
+    for (unsigned I = 0; I < Intermediates.size(); I += 2) {
+      SDValue Op0 = ToZipVT(Intermediates[I + 0]);
+      SDValue Op1 = ToZipVT(Intermediates[I + 1]);
+      Intermediates[I / 2] = DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+    }
+
+    Intermediates.resize(Intermediates.size() / 2);
+    if (Intermediates.size() > 1) {
+      // Prefer FP values to keep elements within vector registers (and also as
+      // f16 is conveniently a legal type).
+      ZipVT = getPackedSVEVectorVT(EVT::getFloatingPointVT(
+          ZipVT.getVectorElementType().getSizeInBits() * 2));
+    }
   }
 
+  assert(Intermediates.size() == 1);
+  SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
+  return convertFromScalableVector(DAG, VT, Vec);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+    return LowerFixedLengthBuildVectorToSVE(Op, DAG);
+
   // Try to build a simple constant vector.
   Op = NormalizeBuildVector(Op, DAG);
   // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1bae7562f459a5..95489f85631801 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1256,6 +1256,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
                                               SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 276f23703df3df..a22c00c1ebce14 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,98 +140,65 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
 ;
 ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_128_NOMAX:       // %bb.0:
-; SVE2_128_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s3
 ; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #14]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #13]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #12]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s3
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #11]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s0
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #10]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #9]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT:    mov z6.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
+; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
 ;
 ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_NOMIN_NOMAX:       // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s3
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s3
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s0
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[7]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z6.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_NOMIN_NOMAX-NEXT:    ret
 ;
 ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_MIN_256_NOMAX:       // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s3
 ; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s3
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s0
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[7]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z6.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_MIN_256_NOMAX-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
@@ -263,89 +230,62 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
 ;
 ; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_128_NOMAX:       // %bb.0:
-; SVE2_128_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT:    ldr d3, [x0]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #14]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z3.b[1]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #13]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #12]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #11]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s0
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #10]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s1
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #9]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z5.b, z0.b
+; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
 ;
 ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_NOMIN_NOMAX:       // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d3, [x0]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z3.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s0
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s1
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w9, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x0]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[3]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[2]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z5.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_NOMIN_NOMAX-NEXT:    ret
 ;
 ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_MIN_256_NOMAX:       // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d3, [x0]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z3.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w8, s0
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT:    fmov w9, s1
-; SVE2_MIN_256_NOMAX-NEXT:    strb w8, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT:    strb w9, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x0]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[3]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[2]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z5.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_MIN_256_NOMAX-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
@@ -401,34 +341,23 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
 define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
 ; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    mov z1.b, z0.b[7]
-; CHECK-NEXT:    mov z2.b, z0.b[6]
-; CHECK-NEXT:    mov z3.b, z0.b[4]
-; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    fmov w9, s2
 ; CHECK-NEXT:    mov z2.b, z0.b[3]
-; CHECK-NEXT:    mov z1.b, z1.b[1]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    mov z3.b, z0.b[2]
-; CHECK-NEXT:    strb w9, [sp, #14]
-; CHECK-NEXT:    mov z0.b, z0.b[1]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    strb w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strb w9, [sp, #11]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w9, [sp, #9]
-; CHECK-NEXT:    strb w8, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z4.b, z0.b[1]
+; CHECK-NEXT:    mov z1.b, z1.b[1]
+; CHECK-NEXT:    mov z5.b, z0.b[7]
+; CHECK-NEXT:    mov z6.b, z0.b[6]
+; CHECK-NEXT:    mov z0.b, z0.b[4]
+; CHECK-NEXT:    zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT:    zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT:    zip1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index 617b560713c3ab..478072d33d8c9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -184,13 +184,11 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
 define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
 ; CHECK-LABEL: vls_sve_and_2xi16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fmov s1, wzr
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    stp wzr, w8, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index b9264ad5f77c37..172e2454d70283 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -91,19 +91,14 @@ define void @bitcast_v32i8(ptr %a, ptr %b) {
 define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: bitcast_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x1]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: bitcast_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index b8a2e0e0f4bd4c..9166dcbf62c4ef 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -222,3 +222,258 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
   store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
   ret void
 }
+
+define void @build_vector_non_const_v4i1(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, w1, lsl #1
+; CHECK-NEXT:    orr w8, w8, w2, lsl #2
+; CHECK-NEXT:    orr w8, w8, w3, lsl #3
+; CHECK-NEXT:    strb w8, [x4]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr w8, w0, w1, lsl #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w2, lsl #2
+; NONEON-NOSVE-NEXT:    orr w8, w8, w3, lsl #3
+; NONEON-NOSVE-NEXT:    strb w8, [x4]
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+  %2 = insertelement <4 x i1>    %1, i1 %b, i64 1
+  %3 = insertelement <4 x i1>    %2, i1 %c, i64 2
+  %4 = insertelement <4 x i1>    %3, i1 %d, i64 3
+  store <4 x i1> %4, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v2f64(double %a, double %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <2 x double> undef, double %a, i64 0
+  %2 = insertelement <2 x double>    %1, double %b, i64 1
+  store <2 x double> %2, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v2f32(float %a, float %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <2 x float> undef, float %a, i64 0
+  %2 = insertelement <2 x float>    %1, float %b, i64 1
+  store <2 x float> %2, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v4f32(float %a, float %b, float %c, float %d, ptr %out)  {
+; CHECK-LABEL: build_vector_non_const_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    // kill: def $s3 killed $s3 def $z3
+; CHECK-NEXT:    // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    zip1 z0.d, z0.d, z2.d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s2, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <4 x float> undef, float %a, i64 0
+  %2 = insertelement <4 x float>    %1, float %b, i64 1
+  %3 = insertelement <4 x float>    %2, float %c, i64 2
+  %4 = insertelement <4 x float>    %3, float %d, i64 3
+  store <4 x float> %4, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v4f64(double %a, double %b, double %c, double %d, ptr %out)  {
+; CHECK-LABEL: build_vector_non_const_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $z3
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d2, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <4 x double> undef, double %a, i64 0
+  %2 = insertelement <4 x double>    %1, double %b, i64 1
+  %3 = insertelement <4 x double>    %2, double %c, i64 2
+  %4 = insertelement <4 x double>    %3, double %d, i64 3
+  store <4 x double> %4, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v8f16(half %a, half %b, half %c, half %d, half %e, half %f, half %g, half %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h6 killed $h6 def $z6
+; CHECK-NEXT:    // kill: def $h4 killed $h4 def $z4
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    // kill: def $h7 killed $h7 def $z7
+; CHECK-NEXT:    // kill: def $h5 killed $h5 def $z5
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $z3
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $z1
+; CHECK-NEXT:    zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z4.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    zip1 z1.s, z4.s, z6.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h7, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h6, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h5, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h4, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half>    %1, half %b, i64 1
+  %3 = insertelement <8 x half>    %2, half %c, i64 2
+  %4 = insertelement <8 x half>    %3, half %d, i64 3
+  %5 = insertelement <8 x half>    %4, half %e, i64 4
+  %6 = insertelement <8 x half>    %5, half %f, i64 5
+  %7 = insertelement <8 x half>    %6, half %g, i64 6
+  %8 = insertelement <8 x half>    %7, half %h, i64 7
+  store <8 x half> %8, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    str d0, [x2]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <2 x i32> undef, i32 %a, i64 0
+  %2 = insertelement <2 x i32>    %1, i32 %b, i64 1
+  store <2 x i32> %2, ptr %out
+  ret void
+}
+
+define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w7
+; CHECK-NEXT:    fmov s1, w6
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    fmov s2, w4
+; CHECK-NEXT:    fmov s3, w3
+; CHECK-NEXT:    fmov s4, w2
+; CHECK-NEXT:    fmov s5, w1
+; CHECK-NEXT:    fmov s6, w0
+; CHECK-NEXT:    zip1 z0.b, z1.b, z0.b
+; CHECK-NEXT:    fmov s1, w5
+; CHECK-NEXT:    zip1 z1.b, z2.b, z1.b
+; CHECK-NEXT:    zip1 z2.b, z4.b, z3.b
+; CHECK-NEXT:    zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    zip1 z1.h, z3.h, z2.h
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %1 = insertelement <8 x i8> undef, i8 %a, i64 0
+  %2 = insertelement <8 x i8>    %1, i8 %b, i64 1
+  %3 = insertelement <8 x i8>    %2, i8 %c, i64 2
+  %4 = insertelement <8 x i8>    %3, i8 %d, i64 3
+  %5 = insertelement <8 x i8>    %4, i8 %e, i64 4
+  %6 = insertelement <8 x i8>    %5, i8 %f, i64 5
+  %7 = insertelement <8 x i8>    %6, i8 %g, i64 6
+  %8 = insertelement <8 x i8>    %7, i8 %h, i64 7
+  store <8 x i8> %8, ptr %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 4b6285b2732fe5..c1810c678ea522 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -12,34 +12,22 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ; CHECK-LABEL: concat_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    mov z2.h, z1.h[3]
-; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z2.h, z1.h[3]
 ; CHECK-NEXT:    mov z3.h, z1.h[2]
-; CHECK-NEXT:    mov z1.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z0.h[3]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strb w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w9, [sp, #14]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z4.h, z1.h[1]
+; CHECK-NEXT:    mov z5.h, z0.h[3]
+; CHECK-NEXT:    mov z6.h, z0.h[2]
+; CHECK-NEXT:    mov z7.h, z0.h[1]
+; CHECK-NEXT:    zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT:    zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT:    zip1 z0.b, z0.b, z7.b
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i8:
@@ -152,22 +140,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
 define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ; CHECK-LABEL: concat_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z2.s, z1.s[1]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z3.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i16:
@@ -428,18 +408,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
 define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ; CHECK-LABEL: concat_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    str h1, [sp, #12]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h2, [sp, #14]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    mov z3.h, z0.h[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 50a05cb4b1e277..7d6336a43a4fd1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -326,29 +326,29 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ; CHECK-LABEL: load_sext_v2i64i256:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    fmov x8, d0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
-; CHECK-NEXT:    asr x9, x8, #63
-; CHECK-NEXT:    fmov x10, d1
-; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    asr x8, x10, #63
-; CHECK-NEXT:    mov z0.d, x9
-; CHECK-NEXT:    stp x10, x8, [sp, #16]
-; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    ldp q2, q4, [sp], #32
-; CHECK-NEXT:    mov z3.d, z0.d[1]
-; CHECK-NEXT:    mov z5.d, z1.d[1]
-; CHECK-NEXT:    mov z6.d, z2.d[1]
-; CHECK-NEXT:    fmov x2, d0
-; CHECK-NEXT:    mov z0.d, z4.d[1]
-; CHECK-NEXT:    fmov x6, d1
-; CHECK-NEXT:    fmov x0, d2
-; CHECK-NEXT:    fmov x4, d4
-; CHECK-NEXT:    fmov x3, d3
-; CHECK-NEXT:    fmov x7, d5
-; CHECK-NEXT:    fmov x1, d6
-; CHECK-NEXT:    fmov x5, d0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    asr x8, x8, #63
+; CHECK-NEXT:    fmov d3, x8
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    asr x9, x9, #63
+; CHECK-NEXT:    fmov d4, x9
+; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, x9
+; CHECK-NEXT:    fmov x2, d2
+; CHECK-NEXT:    zip1 z1.d, z1.d, z4.d
+; CHECK-NEXT:    mov z4.d, z2.d[1]
+; CHECK-NEXT:    mov z5.d, z0.d[1]
+; CHECK-NEXT:    mov z6.d, z3.d[1]
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    fmov x6, d3
+; CHECK-NEXT:    mov z2.d, z1.d[1]
+; CHECK-NEXT:    fmov x3, d4
+; CHECK-NEXT:    fmov x1, d5
+; CHECK-NEXT:    fmov x4, d1
+; CHECK-NEXT:    fmov x7, d6
+; CHECK-NEXT:    fmov x5, d2
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 2665696308463f..a728cbe97056db 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -10,23 +10,15 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ; CHECK-LABEL: extract_subvector_v8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
 ; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
@@ -53,23 +45,15 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ; CHECK-LABEL: extract_subvector_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
 ; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index dad53b31db0b0f..f1771a753826cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -1126,49 +1126,39 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v4f16_v4f64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    sub sp, sp, #16
-; SVE-NEXT:    .cfi_def_cfa_offset 16
-; SVE-NEXT:    ldp q1, q0, [x1]
-; SVE-NEXT:    ldr d4, [x0]
-; SVE-NEXT:    and z4.h, z4.h, #0x7fff
-; SVE-NEXT:    mov z2.d, z0.d[1]
-; SVE-NEXT:    mov z3.d, z1.d[1]
-; SVE-NEXT:    fcvt h0, d0
+; SVE-NEXT:    ldp q0, q1, [x1]
+; SVE-NEXT:    mov z2.d, z1.d[1]
+; SVE-NEXT:    mov z3.d, z0.d[1]
 ; SVE-NEXT:    fcvt h1, d1
+; SVE-NEXT:    fcvt h0, d0
 ; SVE-NEXT:    fcvt h2, d2
 ; SVE-NEXT:    fcvt h3, d3
-; SVE-NEXT:    str h0, [sp, #12]
-; SVE-NEXT:    str h1, [sp, #8]
-; SVE-NEXT:    str h2, [sp, #14]
-; SVE-NEXT:    str h3, [sp, #10]
-; SVE-NEXT:    ldr d0, [sp, #8]
+; SVE-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    ldr d1, [x0]
+; SVE-NEXT:    and z1.h, z1.h, #0x7fff
 ; SVE-NEXT:    and z0.h, z0.h, #0x8000
-; SVE-NEXT:    orr z0.d, z4.d, z0.d
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str d0, [x0]
-; SVE-NEXT:    add sp, sp, #16
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v4f16_v4f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    sub sp, sp, #16
-; SVE2-NEXT:    .cfi_def_cfa_offset 16
-; SVE2-NEXT:    ldp q2, q1, [x1]
-; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT:    ldr d5, [x0]
-; SVE2-NEXT:    mov z3.d, z1.d[1]
-; SVE2-NEXT:    mov z4.d, z2.d[1]
+; SVE2-NEXT:    ldp q0, q1, [x1]
+; SVE2-NEXT:    mov z2.d, z1.d[1]
+; SVE2-NEXT:    mov z3.d, z0.d[1]
 ; SVE2-NEXT:    fcvt h1, d1
+; SVE2-NEXT:    fcvt h0, d0
 ; SVE2-NEXT:    fcvt h2, d2
 ; SVE2-NEXT:    fcvt h3, d3
-; SVE2-NEXT:    fcvt h4, d4
-; SVE2-NEXT:    str h1, [sp, #12]
-; SVE2-NEXT:    str h2, [sp, #8]
-; SVE2-NEXT:    str h3, [sp, #14]
-; SVE2-NEXT:    str h4, [sp, #10]
-; SVE2-NEXT:    ldr d1, [sp, #8]
-; SVE2-NEXT:    bsl z5.d, z5.d, z1.d, z0.d
-; SVE2-NEXT:    str d5, [x0]
-; SVE2-NEXT:    add sp, sp, #16
+; SVE2-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2-NEXT:    mov z2.h, #32767 // =0x7fff
+; SVE2-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE2-NEXT:    ldr d1, [x0]
+; SVE2-NEXT:    bsl z1.d, z1.d, z0.d, z2.d
+; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index a206fbc5102953..11fee267660c03 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -443,9 +443,10 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzu x8, h0
 ; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
@@ -471,19 +472,20 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    fcvtzu x10, h2
-; CHECK-NEXT:    fcvtzu x11, h0
-; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z2.h, z0.h[2]
+; CHECK-NEXT:    mov z3.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x10, h0
+; CHECK-NEXT:    fcvtzu x8, h1
+; CHECK-NEXT:    fcvtzu x9, h2
+; CHECK-NEXT:    fcvtzu x11, h3
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x11
+; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
@@ -521,31 +523,35 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    fcvtzu x12, h0
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z3.h, z0.h[2]
+; CHECK-NEXT:    mov z4.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x10, h0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    fcvtzu x8, h1
-; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x10, h3
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzu x11, h1
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-64]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    fcvtzu x8, h2
 ; CHECK-NEXT:    fcvtzu x9, h3
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    fcvtzu x10, h0
-; CHECK-NEXT:    ldp q2, q3, [sp]
-; CHECK-NEXT:    stp x12, x8, [sp, #32]
-; CHECK-NEXT:    stp x10, x9, [sp, #48]
-; CHECK-NEXT:    ldp q1, q0, [sp, #32]
-; CHECK-NEXT:    stp q2, q3, [x1, #32]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    fcvtzu x11, h4
+; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    mov z6.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fcvtzu x14, h1
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    fcvtzu x12, h5
+; CHECK-NEXT:    fcvtzu x13, h6
+; CHECK-NEXT:    fcvtzu x15, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x12
+; CHECK-NEXT:    fmov d4, x13
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    fmov d3, x14
+; CHECK-NEXT:    zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT:    fmov d4, x15
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT:    stp q3, q1, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
@@ -598,57 +604,67 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    mov z5.d, z1.d
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z4.h, z1.h[1]
+; CHECK-NEXT:    mov z6.h, z1.h[3]
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    mov z7.h, z0.h[1]
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    mov z4.h, z2.h[1]
-; CHECK-NEXT:    fcvtzu x8, h2
-; CHECK-NEXT:    mov z5.h, z2.h[3]
-; CHECK-NEXT:    mov z2.h, z2.h[2]
-; CHECK-NEXT:    fcvtzu x12, h3
-; CHECK-NEXT:    fcvtzu x9, h4
-; CHECK-NEXT:    mov z4.h, z3.h[1]
-; CHECK-NEXT:    fcvtzu x10, h5
-; CHECK-NEXT:    mov z5.h, z3.h[3]
-; CHECK-NEXT:    fcvtzu x11, h2
-; CHECK-NEXT:    mov z2.h, z3.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-128]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    fcvtzu x8, h4
-; CHECK-NEXT:    fcvtzu x9, h5
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
 ; CHECK-NEXT:    fcvtzu x10, h2
-; CHECK-NEXT:    mov z3.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    fcvtzu x11, h1
+; CHECK-NEXT:    fcvtzu x11, h4
+; CHECK-NEXT:    fcvtzu x12, h6
 ; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    stp x12, x8, [sp, #64]
-; CHECK-NEXT:    fcvtzu x12, h3
-; CHECK-NEXT:    fcvtzu x8, h4
-; CHECK-NEXT:    stp x10, x9, [sp, #80]
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzu x10, h0
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x11, x12, [sp, #32]
-; CHECK-NEXT:    fcvtzu x11, h2
-; CHECK-NEXT:    fcvtzu x12, h3
-; CHECK-NEXT:    stp x9, x8, [sp, #48]
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    ldp q0, q1, [sp]
-; CHECK-NEXT:    ldp q3, q4, [sp, #64]
-; CHECK-NEXT:    stp x10, x11, [sp, #96]
-; CHECK-NEXT:    ldp q6, q7, [sp, #32]
-; CHECK-NEXT:    stp x8, x12, [sp, #112]
-; CHECK-NEXT:    ldp q5, q2, [sp, #96]
-; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    stp q6, q7, [x1]
-; CHECK-NEXT:    stp q3, q4, [x1, #96]
-; CHECK-NEXT:    stp q5, q2, [x1, #64]
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    fmov d16, x9
+; CHECK-NEXT:    mov z2.h, z3.h[3]
+; CHECK-NEXT:    mov z4.h, z5.h[3]
+; CHECK-NEXT:    fcvtzu x14, h3
+; CHECK-NEXT:    fcvtzu x13, h1
+; CHECK-NEXT:    fcvtzu x15, h5
+; CHECK-NEXT:    mov z1.h, z3.h[1]
+; CHECK-NEXT:    mov z6.h, z5.h[1]
+; CHECK-NEXT:    mov z5.h, z5.h[2]
+; CHECK-NEXT:    mov z3.h, z3.h[2]
+; CHECK-NEXT:    fcvtzu x9, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fcvtzu x10, h4
+; CHECK-NEXT:    fmov d4, x11
+; CHECK-NEXT:    fcvtzu x11, h7
+; CHECK-NEXT:    fmov d7, x12
+; CHECK-NEXT:    fcvtzu x12, h0
+; CHECK-NEXT:    fmov d0, x13
+; CHECK-NEXT:    fcvtzu x13, h1
+; CHECK-NEXT:    fmov d1, x14
+; CHECK-NEXT:    fcvtzu x14, h6
+; CHECK-NEXT:    fmov d6, x15
+; CHECK-NEXT:    fcvtzu x15, h5
+; CHECK-NEXT:    fmov d5, x9
+; CHECK-NEXT:    fcvtzu x9, h3
+; CHECK-NEXT:    zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT:    fmov d16, x8
+; CHECK-NEXT:    zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT:    fmov d3, x12
+; CHECK-NEXT:    fmov d7, x10
+; CHECK-NEXT:    stp q4, q0, [x1, #64]
+; CHECK-NEXT:    fmov d0, x14
+; CHECK-NEXT:    fmov d4, x9
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT:    fmov d16, x15
+; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    fmov d2, x13
+; CHECK-NEXT:    zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    stp q0, q7, [x1, #96]
+; CHECK-NEXT:    stp q1, q4, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
@@ -1216,26 +1232,18 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ; CHECK-LABEL: fcvtzu_v4f64_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    mov z2.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z1.s[1]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    mov z3.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
@@ -1270,40 +1278,29 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ; CHECK-LABEL: fcvtzu_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.s, z3.s[1]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    mov z4.s, z0.s[1]
+; CHECK-NEXT:    mov z5.s, z1.s[1]
+; CHECK-NEXT:    mov z6.s, z3.s[1]
+; CHECK-NEXT:    mov z7.s, z2.s[1]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
@@ -1360,73 +1357,50 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f64_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q5, q6, [x0, #96]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x0, #96]
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    ldp q0, q4, [x0, #32]
+; CHECK-NEXT:    ldp q2, q7, [x0, #64]
+; CHECK-NEXT:    ldp q1, q3, [x0]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.d
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    ldp q6, q7, [x0, #64]
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z16.s, z1.s[1]
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z0.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z3.s[1]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT:    mov z3.s, z5.s[1]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    mov z17.s, z6.s[1]
+; CHECK-NEXT:    mov z16.s, z4.s[1]
+; CHECK-NEXT:    mov z18.s, z5.s[1]
+; CHECK-NEXT:    mov z21.s, z0.s[1]
+; CHECK-NEXT:    mov z19.s, z7.s[1]
+; CHECK-NEXT:    mov z20.s, z2.s[1]
+; CHECK-NEXT:    mov z22.s, z3.s[1]
+; CHECK-NEXT:    mov z23.s, z1.s[1]
+; CHECK-NEXT:    zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT:    zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT:    zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT:    zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT:    zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    stp q0, q2, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
@@ -2187,9 +2161,10 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzs x8, h0
 ; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
@@ -2215,19 +2190,20 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    fcvtzs x10, h2
-; CHECK-NEXT:    fcvtzs x11, h0
-; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z2.h, z0.h[2]
+; CHECK-NEXT:    mov z3.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x10, h0
+; CHECK-NEXT:    fcvtzs x8, h1
+; CHECK-NEXT:    fcvtzs x9, h2
+; CHECK-NEXT:    fcvtzs x11, h3
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x11
+; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
@@ -2265,31 +2241,35 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    fcvtzs x12, h0
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z3.h, z0.h[2]
+; CHECK-NEXT:    mov z4.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x10, h0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    fcvtzs x8, h1
-; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x10, h3
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzs x11, h1
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-64]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    fcvtzs x8, h2
 ; CHECK-NEXT:    fcvtzs x9, h3
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    fcvtzs x10, h0
-; CHECK-NEXT:    ldp q2, q3, [sp]
-; CHECK-NEXT:    stp x12, x8, [sp, #32]
-; CHECK-NEXT:    stp x10, x9, [sp, #48]
-; CHECK-NEXT:    ldp q1, q0, [sp, #32]
-; CHECK-NEXT:    stp q2, q3, [x1, #32]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    fcvtzs x11, h4
+; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    mov z6.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fcvtzs x14, h1
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    fcvtzs x12, h5
+; CHECK-NEXT:    fcvtzs x13, h6
+; CHECK-NEXT:    fcvtzs x15, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fmov d1, x12
+; CHECK-NEXT:    fmov d4, x13
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    fmov d3, x14
+; CHECK-NEXT:    zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT:    fmov d4, x15
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT:    stp q3, q1, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
@@ -2342,57 +2322,67 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    mov z5.d, z1.d
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z4.h, z1.h[1]
+; CHECK-NEXT:    mov z6.h, z1.h[3]
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    mov z7.h, z0.h[1]
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    mov z4.h, z2.h[1]
-; CHECK-NEXT:    fcvtzs x8, h2
-; CHECK-NEXT:    mov z5.h, z2.h[3]
-; CHECK-NEXT:    mov z2.h, z2.h[2]
-; CHECK-NEXT:    fcvtzs x12, h3
-; CHECK-NEXT:    fcvtzs x9, h4
-; CHECK-NEXT:    mov z4.h, z3.h[1]
-; CHECK-NEXT:    fcvtzs x10, h5
-; CHECK-NEXT:    mov z5.h, z3.h[3]
-; CHECK-NEXT:    fcvtzs x11, h2
-; CHECK-NEXT:    mov z2.h, z3.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #-128]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    fcvtzs x8, h4
-; CHECK-NEXT:    fcvtzs x9, h5
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
 ; CHECK-NEXT:    fcvtzs x10, h2
-; CHECK-NEXT:    mov z3.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    fcvtzs x11, h1
+; CHECK-NEXT:    fcvtzs x11, h4
+; CHECK-NEXT:    fcvtzs x12, h6
 ; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    stp x12, x8, [sp, #64]
-; CHECK-NEXT:    fcvtzs x12, h3
-; CHECK-NEXT:    fcvtzs x8, h4
-; CHECK-NEXT:    stp x10, x9, [sp, #80]
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    mov z3.h, z0.h[3]
-; CHECK-NEXT:    fcvtzs x10, h0
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x11, x12, [sp, #32]
-; CHECK-NEXT:    fcvtzs x11, h2
-; CHECK-NEXT:    fcvtzs x12, h3
-; CHECK-NEXT:    stp x9, x8, [sp, #48]
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    ldp q0, q1, [sp]
-; CHECK-NEXT:    ldp q3, q4, [sp, #64]
-; CHECK-NEXT:    stp x10, x11, [sp, #96]
-; CHECK-NEXT:    ldp q6, q7, [sp, #32]
-; CHECK-NEXT:    stp x8, x12, [sp, #112]
-; CHECK-NEXT:    ldp q5, q2, [sp, #96]
-; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    stp q6, q7, [x1]
-; CHECK-NEXT:    stp q3, q4, [x1, #96]
-; CHECK-NEXT:    stp q5, q2, [x1, #64]
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    fmov d16, x9
+; CHECK-NEXT:    mov z2.h, z3.h[3]
+; CHECK-NEXT:    mov z4.h, z5.h[3]
+; CHECK-NEXT:    fcvtzs x14, h3
+; CHECK-NEXT:    fcvtzs x13, h1
+; CHECK-NEXT:    fcvtzs x15, h5
+; CHECK-NEXT:    mov z1.h, z3.h[1]
+; CHECK-NEXT:    mov z6.h, z5.h[1]
+; CHECK-NEXT:    mov z5.h, z5.h[2]
+; CHECK-NEXT:    mov z3.h, z3.h[2]
+; CHECK-NEXT:    fcvtzs x9, h2
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fcvtzs x10, h4
+; CHECK-NEXT:    fmov d4, x11
+; CHECK-NEXT:    fcvtzs x11, h7
+; CHECK-NEXT:    fmov d7, x12
+; CHECK-NEXT:    fcvtzs x12, h0
+; CHECK-NEXT:    fmov d0, x13
+; CHECK-NEXT:    fcvtzs x13, h1
+; CHECK-NEXT:    fmov d1, x14
+; CHECK-NEXT:    fcvtzs x14, h6
+; CHECK-NEXT:    fmov d6, x15
+; CHECK-NEXT:    fcvtzs x15, h5
+; CHECK-NEXT:    fmov d5, x9
+; CHECK-NEXT:    fcvtzs x9, h3
+; CHECK-NEXT:    zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT:    fmov d16, x8
+; CHECK-NEXT:    zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT:    fmov d3, x12
+; CHECK-NEXT:    fmov d7, x10
+; CHECK-NEXT:    stp q4, q0, [x1, #64]
+; CHECK-NEXT:    fmov d0, x14
+; CHECK-NEXT:    fmov d4, x9
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT:    fmov d16, x15
+; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    fmov d2, x13
+; CHECK-NEXT:    zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    stp q0, q7, [x1, #96]
+; CHECK-NEXT:    stp q1, q4, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
@@ -2962,26 +2952,18 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ; CHECK-LABEL: fcvtzs_v4f64_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    mov z2.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z1.s[1]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    mov z3.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
@@ -3016,40 +2998,29 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ; CHECK-LABEL: fcvtzs_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.s, z3.s[1]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    mov z4.s, z0.s[1]
+; CHECK-NEXT:    mov z5.s, z1.s[1]
+; CHECK-NEXT:    mov z6.s, z3.s[1]
+; CHECK-NEXT:    mov z7.s, z2.s[1]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
@@ -3106,73 +3077,50 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f64_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q5, q6, [x0, #96]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x0, #96]
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    ldp q0, q4, [x0, #32]
+; CHECK-NEXT:    ldp q2, q7, [x0, #64]
+; CHECK-NEXT:    ldp q1, q3, [x0]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.d
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    ldp q6, q7, [x0, #64]
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z16.s, z1.s[1]
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z0.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z3.s[1]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT:    mov z3.s, z5.s[1]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z2.s[1]
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    mov z17.s, z6.s[1]
+; CHECK-NEXT:    mov z16.s, z4.s[1]
+; CHECK-NEXT:    mov z18.s, z5.s[1]
+; CHECK-NEXT:    mov z21.s, z0.s[1]
+; CHECK-NEXT:    mov z19.s, z7.s[1]
+; CHECK-NEXT:    mov z20.s, z2.s[1]
+; CHECK-NEXT:    mov z22.s, z3.s[1]
+; CHECK-NEXT:    mov z23.s, z1.s[1]
+; CHECK-NEXT:    zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT:    zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT:    zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT:    zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT:    zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT:    zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    stp q0, q2, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 035c76b569298a..e3c89981cb27af 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -8,25 +8,20 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask) {
 ; CHECK-LABEL: select_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
-; CHECK-NEXT:    mov z3.s, z2.s[1]
-; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z4.s, z2.s[1]
+; CHECK-NEXT:    zip1 z3.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d2, [sp, #8]
+; CHECK-NEXT:    zip1 z2.h, z2.h, z4.h
+; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
 ; CHECK-NEXT:    lsl z2.h, z2.h, #15
 ; CHECK-NEXT:    asr z2.h, z2.h, #15
 ; CHECK-NEXT:    and z2.h, z2.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: select_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index d77473ed8f08e5..87e3d0d09817ba 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -506,14 +506,12 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
 define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; CHECK-LABEL: insertelement_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    fmov h1, #5.00000000
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index afd3bb7161c155..f71bfb770b15f4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1160,18 +1160,16 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-LABEL: ucvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    mov z2.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT:    fmov x9, d2
 ; CHECK-NEXT:    ucvtf h0, x8
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    ucvtf h1, x8
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ucvtf h2, x9
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
@@ -2618,18 +2616,16 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-LABEL: scvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    mov z2.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT:    fmov x9, d2
 ; CHECK-NEXT:    scvtf h0, x8
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    scvtf h1, x8
-; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    str h1, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    scvtf h2, x9
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 270f05a806b82d..ef6b1c9acbf105 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -10,25 +10,20 @@ declare void @def(ptr)
 define void @alloc_v4i8(ptr %st_ptr) nounwind {
 ; CHECK-LABEL: alloc_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    add x0, sp, #28
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    add x20, sp, #28
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x20, sp, #12
 ; CHECK-NEXT:    bl def
 ; CHECK-NEXT:    ptrue p0.b, vl2
 ; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x20]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    mov z2.b, z0.b[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x19]
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: alloc_v4i8:
@@ -62,32 +57,29 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
 define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ; CHECK-LABEL: alloc_v6i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x0, sp, #8
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    add x8, sp, #4
 ; CHECK-NEXT:    ptrue p1.s, vl2
 ; CHECK-NEXT:    mov z1.b, z0.b[3]
-; CHECK-NEXT:    mov z2.b, z0.b[5]
-; CHECK-NEXT:    mov z0.b, z0.b[1]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    add x8, sp, #20
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    st1b { z0.h }, p0, [x8]
-; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x8]
-; CHECK-NEXT:    strb w9, [x19, #2]
+; CHECK-NEXT:    mov z2.b, z0.b[1]
+; CHECK-NEXT:    mov z0.b, z0.b[5]
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z2.h, z0.h, z0.h
+; CHECK-NEXT:    zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    st1b { z1.h }, p0, [x8]
+; CHECK-NEXT:    ld1h { z1.s }, p1/z, [x8]
 ; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x19, #2]
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    strh w8, [x19]
-; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: alloc_v6i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 5f4b9dd1592cf2..1b90aed22f9d8d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -676,79 +676,105 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldr w8, [sp, #224]
-; CHECK-NEXT:    ldr w9, [sp, #216]
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    strb w7, [sp, #6]
-; CHECK-NEXT:    strb w8, [sp, #31]
-; CHECK-NEXT:    ldr w8, [sp, #208]
-; CHECK-NEXT:    strb w9, [sp, #30]
-; CHECK-NEXT:    ldr w9, [sp, #200]
-; CHECK-NEXT:    strb w8, [sp, #29]
 ; CHECK-NEXT:    ldr w8, [sp, #192]
-; CHECK-NEXT:    strb w9, [sp, #28]
 ; CHECK-NEXT:    ldr w9, [sp, #184]
-; CHECK-NEXT:    strb w8, [sp, #27]
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldr w10, [sp, #160]
+; CHECK-NEXT:    ldr w11, [sp, #144]
+; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    ldr w8, [sp, #176]
-; CHECK-NEXT:    strb w9, [sp, #26]
+; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    ldr w9, [sp, #168]
-; CHECK-NEXT:    strb w8, [sp, #25]
-; CHECK-NEXT:    ldr w8, [sp, #160]
-; CHECK-NEXT:    strb w9, [sp, #24]
-; CHECK-NEXT:    ldr w9, [sp, #152]
-; CHECK-NEXT:    strb w8, [sp, #23]
-; CHECK-NEXT:    ldr w8, [sp, #144]
-; CHECK-NEXT:    strb w9, [sp, #22]
+; CHECK-NEXT:    fmov s3, w10
+; CHECK-NEXT:    fmov s4, w11
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    ldr w8, [sp, #152]
+; CHECK-NEXT:    ldr w10, [sp, #112]
+; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    ldr w9, [sp, #136]
-; CHECK-NEXT:    strb w8, [sp, #21]
+; CHECK-NEXT:    ldr w11, [sp, #96]
+; CHECK-NEXT:    fmov s5, w8
 ; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    strb w9, [sp, #20]
+; CHECK-NEXT:    zip1 z0.b, z1.b, z0.b
+; CHECK-NEXT:    fmov s7, w9
 ; CHECK-NEXT:    ldr w9, [sp, #120]
-; CHECK-NEXT:    strb w8, [sp, #19]
-; CHECK-NEXT:    ldr w8, [sp, #112]
-; CHECK-NEXT:    strb w9, [sp, #18]
-; CHECK-NEXT:    ldr w9, [sp, #104]
-; CHECK-NEXT:    strb w8, [sp, #17]
-; CHECK-NEXT:    ldr w8, [sp, #96]
-; CHECK-NEXT:    strb w9, [sp, #16]
+; CHECK-NEXT:    fmov s18, w10
+; CHECK-NEXT:    fmov s16, w8
+; CHECK-NEXT:    ldr w8, [sp, #104]
+; CHECK-NEXT:    zip1 z2.b, z6.b, z2.b
+; CHECK-NEXT:    fmov s17, w9
 ; CHECK-NEXT:    ldr w9, [sp, #88]
-; CHECK-NEXT:    strb w8, [sp, #15]
+; CHECK-NEXT:    fmov s20, w11
+; CHECK-NEXT:    fmov s19, w8
 ; CHECK-NEXT:    ldr w8, [sp, #80]
-; CHECK-NEXT:    strb w9, [sp, #14]
+; CHECK-NEXT:    ldr w10, [sp, #64]
+; CHECK-NEXT:    fmov s21, w9
 ; CHECK-NEXT:    ldr w9, [sp, #72]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    ldr w8, [sp, #64]
-; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    ldr w9, [sp, #56]
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    ldr w8, [sp, #48]
-; CHECK-NEXT:    strb w9, [sp, #10]
+; CHECK-NEXT:    ldr w11, [sp, #48]
+; CHECK-NEXT:    fmov s22, w8
+; CHECK-NEXT:    ldr w8, [sp, #56]
+; CHECK-NEXT:    zip1 z3.b, z5.b, z3.b
+; CHECK-NEXT:    fmov s23, w9
 ; CHECK-NEXT:    ldr w9, [sp, #40]
-; CHECK-NEXT:    strb w8, [sp, #9]
+; CHECK-NEXT:    zip1 z4.b, z7.b, z4.b
+; CHECK-NEXT:    fmov s25, w8
 ; CHECK-NEXT:    ldr w8, [sp, #32]
-; CHECK-NEXT:    strb w9, [sp, #8]
-; CHECK-NEXT:    strb w8, [sp, #7]
+; CHECK-NEXT:    fmov s24, w10
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    ldr w9, [sp, #24]
+; CHECK-NEXT:    fmov s26, w11
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    ldr w8, [sp, #16]
+; CHECK-NEXT:    zip1 z16.b, z17.b, z16.b
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    ldr w9, [sp, #8]
+; CHECK-NEXT:    zip1 z17.b, z19.b, z18.b
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    ldr w8, [sp]
+; CHECK-NEXT:    zip1 z19.b, z21.b, z20.b
+; CHECK-NEXT:    fmov s18, w9
+; CHECK-NEXT:    zip1 z20.b, z23.b, z22.b
+; CHECK-NEXT:    fmov s23, w7
+; CHECK-NEXT:    fmov s22, w8
+; CHECK-NEXT:    zip1 z21.b, z25.b, z24.b
+; CHECK-NEXT:    zip1 z1.b, z1.b, z26.b
+; CHECK-NEXT:    zip1 z5.b, z5.b, z6.b
+; CHECK-NEXT:    fmov s24, w3
+; CHECK-NEXT:    fmov s25, w2
+; CHECK-NEXT:    zip1 z6.b, z18.b, z7.b
+; CHECK-NEXT:    fmov s18, w6
+; CHECK-NEXT:    fmov s26, w1
+; CHECK-NEXT:    zip1 z7.b, z23.b, z22.b
+; CHECK-NEXT:    fmov s22, w5
+; CHECK-NEXT:    fmov s23, w4
+; CHECK-NEXT:    zip1 z0.h, z2.h, z0.h
+; CHECK-NEXT:    zip1 z2.h, z4.h, z3.h
+; CHECK-NEXT:    zip1 z3.h, z17.h, z16.h
+; CHECK-NEXT:    zip1 z4.h, z20.h, z19.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z21.h
+; CHECK-NEXT:    zip1 z5.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z18.b, z22.b, z18.b
+; CHECK-NEXT:    zip1 z22.b, z24.b, z23.b
 ; CHECK-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEXT:    strb w6, [sp, #5]
-; CHECK-NEXT:    strb w5, [sp, #4]
-; CHECK-NEXT:    strb w4, [sp, #3]
-; CHECK-NEXT:    strb w3, [sp, #2]
-; CHECK-NEXT:    strb w2, [sp, #1]
-; CHECK-NEXT:    strb w1, [sp]
-; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    zip1 z23.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z0.s, z2.s, z0.s
+; CHECK-NEXT:    zip1 z2.s, z4.s, z3.s
+; CHECK-NEXT:    zip1 z1.s, z5.s, z1.s
+; CHECK-NEXT:    zip1 z6.h, z18.h, z7.h
+; CHECK-NEXT:    zip1 z7.h, z23.h, z22.h
+; CHECK-NEXT:    zip1 z0.d, z2.d, z0.d
+; CHECK-NEXT:    zip1 z3.s, z7.s, z6.s
 ; CHECK-NEXT:    lsl z0.b, z0.b, #7
-; CHECK-NEXT:    lsl z1.b, z1.b, #7
+; CHECK-NEXT:    zip1 z1.d, z3.d, z1.d
 ; CHECK-NEXT:    asr z0.b, z0.b, #7
-; CHECK-NEXT:    asr z1.b, z1.b, #7
+; CHECK-NEXT:    lsl z1.b, z1.b, #7
 ; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT:    asr z1.b, z1.b, #7
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v32i8:
@@ -1466,23 +1492,18 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    fmov s1, wzr
+; CHECK-NEXT:    mov z2.s, z0.s[1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f16:
@@ -2318,33 +2339,21 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    mov z1.b, z0.b[3]
 ; CHECK-NEXT:    mov z2.b, z0.b[2]
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    mov z3.b, z0.b[1]
 ; CHECK-NEXT:    mov z4.b, z0.b[7]
-; CHECK-NEXT:    strh w8, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.b, z0.b[6]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[5]
-; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    ldp d0, d1, [sp]
+; CHECK-NEXT:    mov z5.b, z0.b[6]
+; CHECK-NEXT:    mov z6.b, z0.b[5]
+; CHECK-NEXT:    mov z7.b, z0.b[4]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z2.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z3.h, z7.h, z6.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    zip1 z1.s, z3.s, z2.s
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    lsl z0.s, z0.s, #31
@@ -2357,7 +2366,6 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8f32:
@@ -2684,23 +2692,22 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-LABEL: masked_load_zext_v3i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    strh w3, [sp, #12]
+; CHECK-NEXT:    fmov s0, w2
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    adrp x8, .LCPI13_0
+; CHECK-NEXT:    fmov s2, w3
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    strh w2, [sp, #10]
-; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT:    strh w1, [sp, #8]
-; CHECK-NEXT:    ldr d1, [sp, #8]
-; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
@@ -2759,23 +2766,22 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-LABEL: masked_load_sext_v3i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    strh w3, [sp, #12]
+; CHECK-NEXT:    fmov s0, w2
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    adrp x8, .LCPI14_0
+; CHECK-NEXT:    fmov s2, w3
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    strh w2, [sp, #10]
-; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT:    strh w1, [sp, #8]
-; CHECK-NEXT:    ldr d1, [sp, #8]
-; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 0c3411e5f55148..2966ab12b8cad6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -293,78 +293,104 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldr w8, [sp, #96]
-; CHECK-NEXT:    ldr w9, [sp, #88]
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldr w10, [sp, #120]
-; CHECK-NEXT:    strb w7, [sp, #6]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    ldr w8, [sp, #80]
-; CHECK-NEXT:    strb w9, [sp, #14]
-; CHECK-NEXT:    ldr w9, [sp, #72]
-; CHECK-NEXT:    strb w8, [sp, #13]
 ; CHECK-NEXT:    ldr w8, [sp, #64]
-; CHECK-NEXT:    strb w9, [sp, #12]
 ; CHECK-NEXT:    ldr w9, [sp, #56]
-; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    fmov s26, w2
+; CHECK-NEXT:    ldr w10, [sp, #32]
+; CHECK-NEXT:    ldr w11, [sp, #16]
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    ldr w8, [sp, #48]
-; CHECK-NEXT:    strb w9, [sp, #10]
+; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    ldr w9, [sp, #40]
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    ldr w8, [sp, #32]
-; CHECK-NEXT:    strb w9, [sp, #8]
-; CHECK-NEXT:    ldr w9, [sp, #216]
-; CHECK-NEXT:    strb w8, [sp, #7]
-; CHECK-NEXT:    ldr w8, [sp, #224]
-; CHECK-NEXT:    strb w9, [sp, #30]
-; CHECK-NEXT:    ldr w9, [sp, #200]
-; CHECK-NEXT:    strb w8, [sp, #31]
-; CHECK-NEXT:    ldr w8, [sp, #208]
-; CHECK-NEXT:    strb w9, [sp, #28]
+; CHECK-NEXT:    fmov s5, w10
+; CHECK-NEXT:    fmov s7, w11
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    ldr w8, [sp, #24]
+; CHECK-NEXT:    ldr w10, [sp, #176]
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    ldr w9, [sp, #8]
+; CHECK-NEXT:    ldr w11, [sp, #168]
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    ldr w8, [sp]
+; CHECK-NEXT:    fmov s19, w10
+; CHECK-NEXT:    fmov s16, w9
 ; CHECK-NEXT:    ldr w9, [sp, #184]
-; CHECK-NEXT:    strb w8, [sp, #29]
+; CHECK-NEXT:    fmov s20, w11
+; CHECK-NEXT:    zip1 z4.b, z3.b, z2.b
+; CHECK-NEXT:    fmov s3, w8
 ; CHECK-NEXT:    ldr w8, [sp, #192]
-; CHECK-NEXT:    strb w9, [sp, #26]
-; CHECK-NEXT:    ldr w9, [sp, #168]
-; CHECK-NEXT:    strb w8, [sp, #27]
-; CHECK-NEXT:    ldr w8, [sp, #176]
-; CHECK-NEXT:    strb w9, [sp, #24]
+; CHECK-NEXT:    fmov s18, w9
 ; CHECK-NEXT:    ldr w9, [sp, #152]
-; CHECK-NEXT:    strb w8, [sp, #25]
+; CHECK-NEXT:    ldr w10, [sp, #136]
+; CHECK-NEXT:    fmov s17, w8
 ; CHECK-NEXT:    ldr w8, [sp, #160]
-; CHECK-NEXT:    strb w9, [sp, #22]
-; CHECK-NEXT:    ldr w9, [sp, #136]
-; CHECK-NEXT:    strb w8, [sp, #23]
+; CHECK-NEXT:    ldr w11, [sp, #120]
+; CHECK-NEXT:    fmov s21, w10
+; CHECK-NEXT:    ldr w10, [sp, #88]
+; CHECK-NEXT:    zip1 z1.b, z1.b, z0.b
+; CHECK-NEXT:    fmov s23, w11
+; CHECK-NEXT:    ldr w11, [sp, #72]
+; CHECK-NEXT:    zip1 z0.b, z6.b, z5.b
+; CHECK-NEXT:    zip1 z17.b, z18.b, z17.b
+; CHECK-NEXT:    zip1 z18.b, z20.b, z19.b
+; CHECK-NEXT:    fmov s19, w8
+; CHECK-NEXT:    fmov s20, w9
 ; CHECK-NEXT:    ldr w8, [sp, #144]
-; CHECK-NEXT:    strb w9, [sp, #20]
-; CHECK-NEXT:    ldr w9, [sp, #112]
-; CHECK-NEXT:    strb w8, [sp, #21]
-; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    strb w6, [sp, #5]
-; CHECK-NEXT:    strb w8, [sp, #19]
-; CHECK-NEXT:    ldr w8, [sp, #104]
-; CHECK-NEXT:    strb w5, [sp, #4]
-; CHECK-NEXT:    strb w4, [sp, #3]
-; CHECK-NEXT:    strb w3, [sp, #2]
-; CHECK-NEXT:    strb w2, [sp, #1]
-; CHECK-NEXT:    strb w1, [sp]
-; CHECK-NEXT:    strb w10, [sp, #18]
-; CHECK-NEXT:    strb w9, [sp, #17]
-; CHECK-NEXT:    strb w8, [sp, #16]
+; CHECK-NEXT:    ldr w9, [sp, #128]
+; CHECK-NEXT:    fmov s24, w10
+; CHECK-NEXT:    fmov s5, w7
+; CHECK-NEXT:    fmov s25, w11
+; CHECK-NEXT:    fmov s22, w9
+; CHECK-NEXT:    ldr w9, [sp, #104]
+; CHECK-NEXT:    zip1 z2.b, z16.b, z7.b
+; CHECK-NEXT:    zip1 z19.b, z20.b, z19.b
+; CHECK-NEXT:    fmov s20, w8
+; CHECK-NEXT:    ldr w8, [sp, #112]
+; CHECK-NEXT:    zip1 z3.b, z5.b, z3.b
+; CHECK-NEXT:    fmov s5, w6
+; CHECK-NEXT:    fmov s6, w5
+; CHECK-NEXT:    fmov s7, w4
+; CHECK-NEXT:    fmov s16, w3
+; CHECK-NEXT:    zip1 z1.h, z4.h, z1.h
+; CHECK-NEXT:    zip1 z20.b, z21.b, z20.b
+; CHECK-NEXT:    zip1 z21.b, z23.b, z22.b
+; CHECK-NEXT:    fmov s22, w8
+; CHECK-NEXT:    fmov s23, w9
+; CHECK-NEXT:    ldr w8, [sp, #96]
+; CHECK-NEXT:    ldr w9, [sp, #80]
+; CHECK-NEXT:    zip1 z5.b, z6.b, z5.b
+; CHECK-NEXT:    zip1 z6.b, z16.b, z7.b
+; CHECK-NEXT:    zip1 z4.h, z18.h, z17.h
+; CHECK-NEXT:    zip1 z16.h, z20.h, z19.h
+; CHECK-NEXT:    zip1 z0.h, z2.h, z0.h
+; CHECK-NEXT:    zip1 z22.b, z23.b, z22.b
+; CHECK-NEXT:    fmov s23, w8
 ; CHECK-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    zip1 z2.h, z5.h, z3.h
+; CHECK-NEXT:    zip1 z4.s, z16.s, z4.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    zip1 z23.b, z24.b, z23.b
+; CHECK-NEXT:    fmov s24, w9
+; CHECK-NEXT:    zip1 z17.h, z22.h, z21.h
+; CHECK-NEXT:    zip1 z24.b, z25.b, z24.b
+; CHECK-NEXT:    fmov s25, w1
+; CHECK-NEXT:    zip1 z7.b, z25.b, z26.b
+; CHECK-NEXT:    zip1 z18.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z3.h, z7.h, z6.h
+; CHECK-NEXT:    zip1 z5.s, z18.s, z17.s
+; CHECK-NEXT:    zip1 z1.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z2.d, z5.d, z4.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    lsl z1.b, z2.b, #7
 ; CHECK-NEXT:    lsl z0.b, z0.b, #7
-; CHECK-NEXT:    lsl z1.b, z1.b, #7
-; CHECK-NEXT:    asr z0.b, z0.b, #7
 ; CHECK-NEXT:    asr z1.b, z1.b, #7
-; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT:    asr z0.b, z0.b, #7
+; CHECK-NEXT:    cmpne p1.b, p0/z, z1.b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p1, [x0, x8]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v32i8:
@@ -589,23 +615,18 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    fmov s1, wzr
+; CHECK-NEXT:    mov z2.s, z0.s[1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v2f16:
@@ -1014,48 +1035,33 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z4.b, z0.b[4]
+; CHECK-NEXT:    mov z5.b, z0.b[3]
+; CHECK-NEXT:    mov z6.b, z0.b[2]
+; CHECK-NEXT:    mov z7.b, z0.b[1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[3]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[2]
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    mov z4.b, z0.b[1]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z2.h, z4.h, z3.h
+; CHECK-NEXT:    zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z7.h
+; CHECK-NEXT:    zip1 z1.s, z2.s, z1.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z3.s
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
+; CHECK-NEXT:    lsl z0.s, z0.s, #31
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
+; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    mov z1.s, #0 // =0x0
-; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr d0, [sp]
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    lsl z0.s, z0.s, #31
-; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8f32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index b91f813c5141bb..620e791c77e89f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -9,65 +9,44 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @zip1_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip1_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    mov z2.b, z0.b[15]
-; CHECK-NEXT:    mov z3.b, z0.b[14]
-; CHECK-NEXT:    mov z4.b, z0.b[13]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z3.b, z0.b[11]
-; CHECK-NEXT:    mov z2.b, z0.b[12]
-; CHECK-NEXT:    strb w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[10]
-; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[9]
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[8]
-; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    mov z4.b, z0.b[14]
+; CHECK-NEXT:    mov z6.b, z0.b[13]
+; CHECK-NEXT:    mov z3.b, z1.b[15]
+; CHECK-NEXT:    mov z5.b, z1.b[14]
+; CHECK-NEXT:    mov z7.b, z1.b[13]
+; CHECK-NEXT:    mov z16.b, z0.b[12]
+; CHECK-NEXT:    mov z17.b, z1.b[12]
+; CHECK-NEXT:    mov z18.b, z0.b[11]
+; CHECK-NEXT:    mov z19.b, z1.b[11]
+; CHECK-NEXT:    mov z20.b, z0.b[10]
+; CHECK-NEXT:    mov z21.b, z1.b[10]
+; CHECK-NEXT:    mov z22.b, z0.b[9]
+; CHECK-NEXT:    mov z23.b, z1.b[9]
+; CHECK-NEXT:    mov z24.b, z0.b[8]
+; CHECK-NEXT:    mov z25.b, z1.b[8]
+; CHECK-NEXT:    zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT:    zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT:    zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT:    zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT:    zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT:    zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT:    zip1 z16.b, z22.b, z23.b
 ; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT:    strb w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[15]
-; CHECK-NEXT:    strb w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[14]
-; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[13]
-; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[12]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[11]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[10]
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[9]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[8]
-; CHECK-NEXT:    strb w9, [sp, #5]
-; CHECK-NEXT:    strb w8, [sp, #7]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strb w8, [sp, #3]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT:    zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z4.h, z7.h, z6.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip1_v32i8:
@@ -159,123 +138,97 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    ldp q0, q4, [x0]
-; CHECK-NEXT:    ldp q2, q5, [x0, #32]
-; CHECK-NEXT:    mov z16.h, z3.h[7]
-; CHECK-NEXT:    mov z18.h, z3.h[6]
-; CHECK-NEXT:    mov z17.h, z4.h[7]
-; CHECK-NEXT:    ldp q6, q7, [x1, #32]
-; CHECK-NEXT:    mov z19.h, z4.h[6]
-; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    mov z5.h, z1.h[7]
+; CHECK-NEXT:    mov z7.h, z1.h[6]
+; CHECK-NEXT:    mov z17.h, z1.h[5]
+; CHECK-NEXT:    mov z4.h, z3.h[7]
+; CHECK-NEXT:    mov z6.h, z3.h[6]
 ; CHECK-NEXT:    mov z16.h, z3.h[5]
-; CHECK-NEXT:    fmov w9, s17
-; CHECK-NEXT:    mov z17.h, z4.h[5]
-; CHECK-NEXT:    mov z20.h, z7.h[6]
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    mov z20.h, z2.h[7]
+; CHECK-NEXT:    mov z21.h, z0.h[7]
 ; CHECK-NEXT:    mov z18.h, z3.h[4]
-; CHECK-NEXT:    strh w9, [sp, #28]
-; CHECK-NEXT:    fmov w9, s19
-; CHECK-NEXT:    mov z19.h, z5.h[7]
-; CHECK-NEXT:    zip1 z3.h, z4.h, z3.h
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z4.h[4]
-; CHECK-NEXT:    strh w9, [sp, #24]
-; CHECK-NEXT:    zip1 z4.h, z5.h, z7.h
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z17.h, z1.h[7]
-; CHECK-NEXT:    add z3.h, z3.h, z4.h
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z0.h[7]
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z1.h[6]
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z17.h, z0.h[6]
-; CHECK-NEXT:    strh w8, [sp, #62]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z1.h[5]
-; CHECK-NEXT:    strh w8, [sp, #60]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z0.h[5]
-; CHECK-NEXT:    strh w8, [sp, #58]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z17.h, z1.h[4]
-; CHECK-NEXT:    strh w8, [sp, #56]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z0.h[4]
-; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    zip1 z1.h, z2.h, z6.h
-; CHECK-NEXT:    strh w8, [sp, #54]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    ldr q16, [sp, #16]
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    strh w8, [sp, #52]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    strh w8, [sp, #50]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z7.h[7]
-; CHECK-NEXT:    strh w8, [sp, #48]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z5.h[6]
-; CHECK-NEXT:    ldr q17, [sp, #48]
-; CHECK-NEXT:    strh w8, [sp, #46]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z7.h[5]
-; CHECK-NEXT:    strh w8, [sp, #44]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z5.h[5]
-; CHECK-NEXT:    strh w8, [sp, #42]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z7.h[4]
-; CHECK-NEXT:    strh w8, [sp, #40]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z5.h[4]
-; CHECK-NEXT:    strh w8, [sp, #38]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z6.h[7]
-; CHECK-NEXT:    strh w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z2.h[7]
-; CHECK-NEXT:    strh w8, [sp, #34]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z6.h[6]
-; CHECK-NEXT:    strh w8, [sp, #32]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z2.h[6]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z6.h[5]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z19.h, z2.h[5]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    mov z20.h, z6.h[4]
-; CHECK-NEXT:    fmov w9, s19
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z2.h[4]
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    ldr q2, [sp, #32]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    add z2.h, z16.h, z2.h
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    ldr q4, [sp]
-; CHECK-NEXT:    stp q3, q2, [x0, #32]
-; CHECK-NEXT:    add z1.h, z17.h, z4.h
-; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    mov z19.h, z1.h[4]
+; CHECK-NEXT:    mov z22.h, z2.h[6]
+; CHECK-NEXT:    mov z23.h, z0.h[6]
+; CHECK-NEXT:    zip1 z24.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z25.h, z7.h, z6.h
+; CHECK-NEXT:    zip1 z17.h, z17.h, z16.h
+; CHECK-NEXT:    ldp q4, q6, [x0, #32]
+; CHECK-NEXT:    zip1 z16.h, z21.h, z20.h
+; CHECK-NEXT:    ldp q5, q7, [x1, #32]
+; CHECK-NEXT:    zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z19.s, z25.s, z24.s
+; CHECK-NEXT:    zip1 z22.h, z23.h, z22.h
+; CHECK-NEXT:    mov z23.h, z2.h[5]
+; CHECK-NEXT:    mov z21.h, z6.h[7]
+; CHECK-NEXT:    mov z24.h, z0.h[5]
+; CHECK-NEXT:    mov z25.h, z2.h[4]
+; CHECK-NEXT:    mov z20.h, z7.h[7]
+; CHECK-NEXT:    mov z26.h, z0.h[4]
+; CHECK-NEXT:    mov z27.h, z6.h[6]
+; CHECK-NEXT:    mov z28.h, z7.h[5]
+; CHECK-NEXT:    mov z29.h, z6.h[5]
+; CHECK-NEXT:    mov z30.h, z7.h[4]
+; CHECK-NEXT:    mov z31.h, z6.h[4]
+; CHECK-NEXT:    mov z8.h, z5.h[7]
+; CHECK-NEXT:    mov z9.h, z4.h[7]
+; CHECK-NEXT:    zip1 z20.h, z21.h, z20.h
+; CHECK-NEXT:    mov z21.h, z7.h[6]
+; CHECK-NEXT:    mov z10.h, z5.h[6]
+; CHECK-NEXT:    mov z11.h, z4.h[6]
+; CHECK-NEXT:    mov z12.h, z5.h[5]
+; CHECK-NEXT:    mov z13.h, z4.h[5]
+; CHECK-NEXT:    mov z14.h, z5.h[4]
+; CHECK-NEXT:    mov z15.h, z4.h[4]
+; CHECK-NEXT:    zip1 z23.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z21.h, z27.h, z21.h
+; CHECK-NEXT:    zip1 z27.h, z29.h, z28.h
+; CHECK-NEXT:    zip1 z28.h, z31.h, z30.h
+; CHECK-NEXT:    zip1 z24.h, z26.h, z25.h
+; CHECK-NEXT:    zip1 z25.h, z9.h, z8.h
+; CHECK-NEXT:    zip1 z26.h, z11.h, z10.h
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z29.h, z13.h, z12.h
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z30.h, z15.h, z14.h
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z17.s, z18.s, z17.s
+; CHECK-NEXT:    zip1 z18.s, z21.s, z20.s
+; CHECK-NEXT:    zip1 z20.s, z28.s, z27.s
+; CHECK-NEXT:    zip1 z16.s, z22.s, z16.s
+; CHECK-NEXT:    zip1 z21.s, z24.s, z23.s
+; CHECK-NEXT:    zip1 z1.h, z1.h, z3.h
+; CHECK-NEXT:    zip1 z3.s, z26.s, z25.s
+; CHECK-NEXT:    zip1 z22.s, z30.s, z29.s
+; CHECK-NEXT:    zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z7.d, z17.d, z19.d
+; CHECK-NEXT:    zip1 z17.d, z20.d, z18.d
+; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z2.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z4.d, z21.d, z16.d
+; CHECK-NEXT:    zip1 z3.d, z22.d, z3.d
+; CHECK-NEXT:    add z1.h, z1.h, z6.h
+; CHECK-NEXT:    add z5.h, z7.h, z17.h
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
+; CHECK-NEXT:    add z2.h, z4.h, z3.h
+; CHECK-NEXT:    stp q1, q5, [x0, #32]
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip_v32i16:
@@ -436,41 +389,28 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 define void @zip1_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip1_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    mov z3.h, z0.h[6]
-; CHECK-NEXT:    mov z4.h, z0.h[5]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[4]
-; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov z4.h, z0.h[6]
+; CHECK-NEXT:    mov z6.h, z0.h[5]
 ; CHECK-NEXT:    mov z3.h, z1.h[7]
+; CHECK-NEXT:    mov z5.h, z1.h[6]
+; CHECK-NEXT:    mov z7.h, z1.h[5]
+; CHECK-NEXT:    mov z16.h, z0.h[4]
+; CHECK-NEXT:    mov z17.h, z1.h[4]
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[6]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z1.h[5]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT:    zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z5.h, z16.h, z17.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip1_v16i16:
@@ -530,8 +470,6 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip1_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
@@ -539,18 +477,13 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z4.s, z0.s[2]
 ; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w8, w9, [sp]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z3.s, z4.s, z5.s
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip1_v8i32:
@@ -636,25 +569,18 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 define void @zip_v4i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: zip_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z3.s, z1.s[3]
 ; CHECK-NEXT:    mov z4.s, z0.s[2]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    stp w9, w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w9, w8, [sp]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip_v4i32:
@@ -1209,65 +1135,44 @@ define void @trn_v8i32_undef(ptr %a) {
 define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: zip2_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    mov z2.b, z0.b[15]
-; CHECK-NEXT:    mov z3.b, z0.b[14]
-; CHECK-NEXT:    mov z4.b, z0.b[13]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z3.b, z0.b[11]
-; CHECK-NEXT:    mov z2.b, z0.b[12]
-; CHECK-NEXT:    strb w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[10]
-; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[9]
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[8]
-; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    mov z4.b, z0.b[14]
+; CHECK-NEXT:    mov z6.b, z0.b[13]
+; CHECK-NEXT:    mov z3.b, z1.b[15]
+; CHECK-NEXT:    mov z5.b, z1.b[14]
+; CHECK-NEXT:    mov z7.b, z1.b[13]
+; CHECK-NEXT:    mov z16.b, z0.b[12]
+; CHECK-NEXT:    mov z17.b, z1.b[12]
+; CHECK-NEXT:    mov z18.b, z0.b[11]
+; CHECK-NEXT:    mov z19.b, z1.b[11]
+; CHECK-NEXT:    mov z20.b, z0.b[10]
+; CHECK-NEXT:    mov z21.b, z1.b[10]
+; CHECK-NEXT:    mov z22.b, z0.b[9]
+; CHECK-NEXT:    mov z23.b, z1.b[9]
+; CHECK-NEXT:    mov z24.b, z0.b[8]
+; CHECK-NEXT:    mov z25.b, z1.b[8]
+; CHECK-NEXT:    zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT:    zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT:    zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT:    zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT:    zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT:    zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT:    zip1 z16.b, z22.b, z23.b
 ; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT:    strb w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[15]
-; CHECK-NEXT:    strb w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[14]
-; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[13]
-; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[12]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[11]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[10]
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[9]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[8]
-; CHECK-NEXT:    strb w9, [sp, #5]
-; CHECK-NEXT:    strb w8, [sp, #7]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strb w8, [sp, #3]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT:    zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z4.h, z7.h, z6.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip2_v32i8:
@@ -1359,41 +1264,28 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: zip2_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    mov z3.h, z0.h[6]
-; CHECK-NEXT:    mov z4.h, z0.h[5]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[4]
-; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov z4.h, z0.h[6]
+; CHECK-NEXT:    mov z6.h, z0.h[5]
 ; CHECK-NEXT:    mov z3.h, z1.h[7]
+; CHECK-NEXT:    mov z5.h, z1.h[6]
+; CHECK-NEXT:    mov z7.h, z1.h[5]
+; CHECK-NEXT:    mov z16.h, z0.h[4]
+; CHECK-NEXT:    mov z17.h, z1.h[4]
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[6]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z1.h[5]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT:    zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT:    zip1 z5.h, z16.h, z17.h
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip2_v16i16:
@@ -1453,8 +1345,6 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: zip2_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
@@ -1462,18 +1352,13 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z4.s, z0.s[2]
 ; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w8, w9, [sp]
-; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z3.s, z4.s, z5.s
 ; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    str q1, [x0, #16]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zip2_v8i32:
@@ -1547,197 +1432,144 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    str d14, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    mov z4.b, z3.b[14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z6.b, z3.b[10]
-; CHECK-NEXT:    mov z5.b, z3.b[12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z7.b, z3.b[8]
-; CHECK-NEXT:    mov z17.b, z3.b[9]
-; CHECK-NEXT:    mov z18.b, z3.b[7]
-; CHECK-NEXT:    mov z16.b, z3.b[11]
-; CHECK-NEXT:    strb w8, [sp, #40]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z3.b[6]
-; CHECK-NEXT:    strb w9, [sp, #32]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z5.b, z3.b[4]
-; CHECK-NEXT:    strb w8, [sp, #47]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z3.b[2]
-; CHECK-NEXT:    strb w9, [sp, #46]
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    mov z7.b, z2.b[14]
-; CHECK-NEXT:    strb w8, [sp, #45]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z2.b[12]
-; CHECK-NEXT:    strb w9, [sp, #44]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    mov z16.b, z2.b[11]
-; CHECK-NEXT:    strb w8, [sp, #43]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z2.b[10]
-; CHECK-NEXT:    strb w9, [sp, #61]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    strb w8, [sp, #42]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z2.b[8]
-; CHECK-NEXT:    strb w9, [sp, #53]
-; CHECK-NEXT:    strb w8, [sp, #41]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z2.b[6]
-; CHECK-NEXT:    strb w8, [sp, #39]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z2.b[4]
-; CHECK-NEXT:    strb w8, [sp, #38]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z2.b[2]
-; CHECK-NEXT:    strb w8, [sp, #37]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[10]
-; CHECK-NEXT:    strb w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z1.b[8]
-; CHECK-NEXT:    strb w8, [sp, #35]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[14]
-; CHECK-NEXT:    strb w8, [sp, #34]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z1.b[12]
-; CHECK-NEXT:    strb w8, [sp, #33]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z1.b[6]
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z1.b[4]
-; CHECK-NEXT:    strb w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[2]
-; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z0.b[14]
-; CHECK-NEXT:    strb w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[12]
-; CHECK-NEXT:    strb w8, [sp, #11]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z0.b[10]
-; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z0.b[8]
-; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z0.b[6]
-; CHECK-NEXT:    strb w8, [sp, #7]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.b, z0.b[4]
-; CHECK-NEXT:    strb w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.b, z0.b[2]
-; CHECK-NEXT:    strb w8, [sp, #5]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z3.b[15]
-; CHECK-NEXT:    strb w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z3.b[13]
-; CHECK-NEXT:    strb w8, [sp, #3]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    ldr q4, [sp, #32]
-; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z3.b[5]
-; CHECK-NEXT:    mov z3.b, z3.b[3]
-; CHECK-NEXT:    ldr q5, [sp]
-; CHECK-NEXT:    strb w8, [sp, #63]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z2.b[13]
-; CHECK-NEXT:    strb w8, [sp, #62]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    strb w8, [sp, #60]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    strb w8, [sp, #59]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z2.b[9]
-; CHECK-NEXT:    strb w8, [sp, #58]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z2.b[5]
-; CHECK-NEXT:    strb w8, [sp, #57]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z2.b[3]
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -64
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z2.b, z1.b[14]
+; CHECK-NEXT:    mov z3.b, z1.b[12]
+; CHECK-NEXT:    mov z4.b, z1.b[10]
+; CHECK-NEXT:    mov z5.b, z1.b[8]
+; CHECK-NEXT:    mov z7.b, z1.b[6]
+; CHECK-NEXT:    mov z16.b, z1.b[4]
+; CHECK-NEXT:    mov z17.b, z1.b[2]
+; CHECK-NEXT:    mov z18.b, z0.b[14]
+; CHECK-NEXT:    mov z20.b, z0.b[12]
+; CHECK-NEXT:    zip1 z3.b, z3.b, z2.b
+; CHECK-NEXT:    ldp q2, q19, [x1]
+; CHECK-NEXT:    zip1 z6.b, z5.b, z4.b
+; CHECK-NEXT:    zip1 z4.b, z16.b, z7.b
+; CHECK-NEXT:    mov z16.b, z0.b[10]
+; CHECK-NEXT:    zip1 z5.b, z1.b, z17.b
+; CHECK-NEXT:    zip1 z7.b, z20.b, z18.b
+; CHECK-NEXT:    mov z17.b, z0.b[8]
+; CHECK-NEXT:    mov z18.b, z0.b[6]
+; CHECK-NEXT:    mov z20.b, z0.b[4]
+; CHECK-NEXT:    mov z21.b, z0.b[2]
+; CHECK-NEXT:    mov z22.b, z19.b[14]
+; CHECK-NEXT:    mov z23.b, z19.b[12]
+; CHECK-NEXT:    mov z24.b, z19.b[10]
+; CHECK-NEXT:    mov z25.b, z19.b[8]
+; CHECK-NEXT:    mov z26.b, z19.b[6]
+; CHECK-NEXT:    mov z27.b, z19.b[4]
+; CHECK-NEXT:    mov z28.b, z19.b[2]
+; CHECK-NEXT:    mov z29.b, z2.b[14]
+; CHECK-NEXT:    mov z30.b, z2.b[12]
+; CHECK-NEXT:    mov z31.b, z2.b[10]
+; CHECK-NEXT:    mov z8.b, z2.b[8]
+; CHECK-NEXT:    zip1 z16.b, z17.b, z16.b
+; CHECK-NEXT:    zip1 z17.b, z20.b, z18.b
+; CHECK-NEXT:    zip1 z18.b, z0.b, z21.b
+; CHECK-NEXT:    zip1 z20.b, z23.b, z22.b
+; CHECK-NEXT:    zip1 z21.b, z25.b, z24.b
+; CHECK-NEXT:    zip1 z22.b, z27.b, z26.b
+; CHECK-NEXT:    zip1 z23.b, z19.b, z28.b
+; CHECK-NEXT:    zip1 z24.b, z30.b, z29.b
+; CHECK-NEXT:    zip1 z25.b, z8.b, z31.b
+; CHECK-NEXT:    zip1 z3.h, z6.h, z3.h
+; CHECK-NEXT:    zip1 z4.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z5.h, z16.h, z7.h
+; CHECK-NEXT:    zip1 z7.h, z18.h, z17.h
+; CHECK-NEXT:    zip1 z16.h, z21.h, z20.h
+; CHECK-NEXT:    zip1 z17.h, z23.h, z22.h
+; CHECK-NEXT:    mov z21.b, z19.b[15]
+; CHECK-NEXT:    zip1 z6.h, z25.h, z24.h
+; CHECK-NEXT:    mov z22.b, z19.b[13]
+; CHECK-NEXT:    mov z23.b, z19.b[11]
+; CHECK-NEXT:    mov z24.b, z19.b[9]
+; CHECK-NEXT:    mov z26.b, z2.b[6]
+; CHECK-NEXT:    mov z27.b, z2.b[4]
+; CHECK-NEXT:    mov z20.b, z2.b[2]
+; CHECK-NEXT:    mov z25.b, z19.b[7]
+; CHECK-NEXT:    mov z28.b, z19.b[1]
+; CHECK-NEXT:    zip1 z21.b, z22.b, z21.b
+; CHECK-NEXT:    mov z29.b, z2.b[15]
+; CHECK-NEXT:    mov z30.b, z2.b[13]
+; CHECK-NEXT:    zip1 z22.b, z24.b, z23.b
+; CHECK-NEXT:    mov z23.b, z1.b[15]
+; CHECK-NEXT:    mov z24.b, z1.b[13]
+; CHECK-NEXT:    zip1 z18.b, z27.b, z26.b
+; CHECK-NEXT:    mov z26.b, z19.b[5]
+; CHECK-NEXT:    mov z27.b, z19.b[3]
+; CHECK-NEXT:    mov z31.b, z1.b[9]
+; CHECK-NEXT:    zip1 z20.b, z2.b, z20.b
+; CHECK-NEXT:    mov z8.b, z1.b[7]
+; CHECK-NEXT:    zip1 z23.b, z24.b, z23.b
+; CHECK-NEXT:    mov z24.b, z1.b[11]
+; CHECK-NEXT:    mov z9.b, z1.b[5]
+; CHECK-NEXT:    zip1 z19.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z25.b, z28.b, z27.b
+; CHECK-NEXT:    zip1 z26.b, z30.b, z29.b
+; CHECK-NEXT:    mov z27.b, z2.b[11]
+; CHECK-NEXT:    mov z28.b, z2.b[9]
+; CHECK-NEXT:    mov z29.b, z2.b[7]
+; CHECK-NEXT:    mov z30.b, z2.b[5]
+; CHECK-NEXT:    mov z10.b, z0.b[11]
+; CHECK-NEXT:    mov z11.b, z0.b[9]
+; CHECK-NEXT:    mov z12.b, z0.b[3]
+; CHECK-NEXT:    mov z13.b, z0.b[1]
+; CHECK-NEXT:    mov z1.b, z1.b[3]
+; CHECK-NEXT:    mov z14.b, z0.b[13]
+; CHECK-NEXT:    mov z0.b, z0.b[5]
+; CHECK-NEXT:    zip1 z24.b, z31.b, z24.b
+; CHECK-NEXT:    mov z31.b, z2.b[3]
 ; CHECK-NEXT:    mov z2.b, z2.b[1]
-; CHECK-NEXT:    strb w8, [sp, #54]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[15]
-; CHECK-NEXT:    strb w8, [sp, #52]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[13]
-; CHECK-NEXT:    strb w8, [sp, #50]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z1.b[11]
-; CHECK-NEXT:    strb w8, [sp, #49]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[9]
-; CHECK-NEXT:    strb w8, [sp, #48]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z1.b[7]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[15]
-; CHECK-NEXT:    strb w8, [sp, #31]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z1.b[5]
-; CHECK-NEXT:    strb w9, [sp, #28]
-; CHECK-NEXT:    strb w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.b, z1.b[3]
-; CHECK-NEXT:    mov z1.b, z1.b[1]
-; CHECK-NEXT:    strb w8, [sp, #29]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z0.b[11]
-; CHECK-NEXT:    strb w8, [sp, #27]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[13]
-; CHECK-NEXT:    strb w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    strb w8, [sp, #25]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.b, z0.b[9]
-; CHECK-NEXT:    strb w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[7]
-; CHECK-NEXT:    strb w8, [sp, #23]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.b, z0.b[5]
-; CHECK-NEXT:    strb w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.b, z0.b[3]
-; CHECK-NEXT:    mov z0.b, z0.b[1]
-; CHECK-NEXT:    strb w8, [sp, #21]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strb w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w8, [sp, #19]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strb w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strb w8, [sp, #17]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ldr q0, [sp, #48]
-; CHECK-NEXT:    add z0.b, z4.b, z0.b
-; CHECK-NEXT:    strb w8, [sp, #16]
-; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    add z1.b, z5.b, z1.b
+; CHECK-NEXT:    zip1 z8.b, z9.b, z8.b
+; CHECK-NEXT:    zip1 z9.b, z11.b, z10.b
+; CHECK-NEXT:    zip1 z10.b, z13.b, z12.b
+; CHECK-NEXT:    zip1 z27.b, z28.b, z27.b
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z1.b, z0.b, z1.b
+; CHECK-NEXT:    zip1 z11.b, z14.b, z0.b
+; CHECK-NEXT:    zip1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    zip1 z28.b, z30.b, z29.b
+; CHECK-NEXT:    zip1 z2.b, z2.b, z31.b
+; CHECK-NEXT:    zip1 z18.h, z20.h, z18.h
+; CHECK-NEXT:    zip1 z20.h, z22.h, z21.h
+; CHECK-NEXT:    zip1 z21.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z8.h
+; CHECK-NEXT:    zip1 z19.h, z25.h, z19.h
+; CHECK-NEXT:    zip1 z22.h, z9.h, z11.h
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z0.h, z10.h, z0.h
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z23.h, z27.h, z26.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT:    zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT:    zip1 z4.s, z7.s, z5.s
+; CHECK-NEXT:    zip1 z5.s, z17.s, z16.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z21.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z22.s
+; CHECK-NEXT:    zip1 z6.s, z18.s, z6.s
+; CHECK-NEXT:    zip1 z7.s, z19.s, z20.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z23.s
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    zip1 z1.d, z6.d, z5.d
+; CHECK-NEXT:    zip1 z2.d, z2.d, z7.d
+; CHECK-NEXT:    add z0.b, z3.b, z0.b
+; CHECK-NEXT:    add z1.b, z1.b, z2.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ldr d14, [sp], #64 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v32i8:
@@ -1922,110 +1754,71 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    mov z4.h, z3.h[6]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z6.h, z3.h[2]
-; CHECK-NEXT:    mov z5.h, z3.h[4]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z7.h, z2.h[6]
-; CHECK-NEXT:    mov z17.h, z2.h[7]
-; CHECK-NEXT:    mov z16.h, z3.h[1]
-; CHECK-NEXT:    strh w8, [sp, #40]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z2.h[4]
-; CHECK-NEXT:    strh w9, [sp, #32]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z5.h, z2.h[2]
-; CHECK-NEXT:    strh w8, [sp, #46]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z1.h[2]
-; CHECK-NEXT:    strh w9, [sp, #44]
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    mov z7.h, z0.h[6]
-; CHECK-NEXT:    strh w8, [sp, #42]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[6]
-; CHECK-NEXT:    strh w9, [sp, #38]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    strh w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp, #56]
-; CHECK-NEXT:    strh w8, [sp, #34]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z0.h[4]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z0.h[2]
-; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z3.h[7]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.h, z3.h[5]
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z3.h[3]
-; CHECK-NEXT:    ldr q3, [sp, #32]
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z2.h[5]
-; CHECK-NEXT:    ldr q4, [sp]
-; CHECK-NEXT:    strh w8, [sp, #62]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    mov z7.h, z1.h[7]
-; CHECK-NEXT:    strh w8, [sp, #60]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z2.h[3]
-; CHECK-NEXT:    mov z2.h, z2.h[1]
-; CHECK-NEXT:    strh w8, [sp, #58]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    strh w8, [sp, #54]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z1.h[5]
-; CHECK-NEXT:    strh w9, [sp, #48]
-; CHECK-NEXT:    strh w8, [sp, #52]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    str d8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset b8, -16
+; CHECK-NEXT:    ldp q1, q6, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x1]
+; CHECK-NEXT:    mov z3.h, z6.h[6]
+; CHECK-NEXT:    mov z4.h, z6.h[4]
+; CHECK-NEXT:    mov z5.h, z6.h[2]
+; CHECK-NEXT:    mov z7.h, z1.h[6]
+; CHECK-NEXT:    mov z16.h, z1.h[4]
+; CHECK-NEXT:    mov z17.h, z1.h[2]
+; CHECK-NEXT:    mov z18.h, z2.h[6]
+; CHECK-NEXT:    mov z19.h, z2.h[4]
+; CHECK-NEXT:    mov z20.h, z2.h[2]
+; CHECK-NEXT:    mov z21.h, z0.h[6]
+; CHECK-NEXT:    mov z22.h, z0.h[4]
+; CHECK-NEXT:    zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z5.h, z16.h, z7.h
+; CHECK-NEXT:    zip1 z7.h, z1.h, z17.h
+; CHECK-NEXT:    zip1 z16.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z18.h, z2.h, z20.h
+; CHECK-NEXT:    mov z19.h, z0.h[2]
+; CHECK-NEXT:    zip1 z17.h, z22.h, z21.h
+; CHECK-NEXT:    mov z20.h, z6.h[7]
+; CHECK-NEXT:    mov z21.h, z6.h[5]
+; CHECK-NEXT:    mov z22.h, z6.h[3]
+; CHECK-NEXT:    mov z6.h, z6.h[1]
+; CHECK-NEXT:    mov z23.h, z1.h[7]
+; CHECK-NEXT:    mov z24.h, z1.h[5]
+; CHECK-NEXT:    mov z25.h, z1.h[3]
 ; CHECK-NEXT:    mov z1.h, z1.h[1]
-; CHECK-NEXT:    strh w8, [sp, #50]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z6.h, z0.h[5]
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ldr q0, [sp, #48]
-; CHECK-NEXT:    add z0.h, z3.h, z0.h
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    add z1.h, z4.h, z1.h
-; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    mov z26.h, z2.h[7]
+; CHECK-NEXT:    mov z27.h, z2.h[5]
+; CHECK-NEXT:    mov z28.h, z2.h[3]
+; CHECK-NEXT:    mov z2.h, z2.h[1]
+; CHECK-NEXT:    mov z29.h, z0.h[7]
+; CHECK-NEXT:    mov z30.h, z0.h[5]
+; CHECK-NEXT:    mov z31.h, z0.h[3]
+; CHECK-NEXT:    mov z8.h, z0.h[1]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z19.h
+; CHECK-NEXT:    zip1 z19.h, z21.h, z20.h
+; CHECK-NEXT:    zip1 z6.h, z6.h, z22.h
+; CHECK-NEXT:    zip1 z20.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z25.h
+; CHECK-NEXT:    zip1 z21.h, z27.h, z26.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT:    zip1 z22.h, z30.h, z29.h
+; CHECK-NEXT:    zip1 z23.h, z8.h, z31.h
+; CHECK-NEXT:    zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT:    zip1 z4.s, z7.s, z5.s
+; CHECK-NEXT:    zip1 z5.s, z18.s, z16.s
+; CHECK-NEXT:    zip1 z6.s, z6.s, z19.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z20.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z17.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z21.s
+; CHECK-NEXT:    zip1 z7.s, z23.s, z22.s
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z5.d
+; CHECK-NEXT:    zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT:    add z1.h, z3.h, z1.h
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v16i16:
@@ -2116,32 +1909,31 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    adrp x8, .LCPI21_0
-; CHECK-NEXT:    ldp q4, q1, [x1]
+; CHECK-NEXT:    ldp q2, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    mov z3.s, z0.s[2]
-; CHECK-NEXT:    mov z5.s, z1.s[2]
-; CHECK-NEXT:    stp s0, s3, [sp, #24]
-; CHECK-NEXT:    mov z3.s, z4.s[2]
-; CHECK-NEXT:    stp s5, s2, [sp, #12]
+; CHECK-NEXT:    mov z4.s, z0.s[2]
 ; CHECK-NEXT:    mov z5.s, z0.s[3]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    stp s3, s1, [sp, #4]
-; CHECK-NEXT:    mov z1.s, z2.s[1]
-; CHECK-NEXT:    str s5, [sp, #44]
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT:    str s0, [sp, #40]
-; CHECK-NEXT:    ldp q3, q2, [sp]
-; CHECK-NEXT:    tbl z0.s, { z4.s }, z5.s
-; CHECK-NEXT:    str s1, [sp, #32]
-; CHECK-NEXT:    ldr q1, [sp, #32]
+; CHECK-NEXT:    mov z6.s, z0.s[1]
+; CHECK-NEXT:    mov z7.s, z1.s[1]
+; CHECK-NEXT:    mov z16.s, z3.s[2]
+; CHECK-NEXT:    mov z17.s, z2.s[2]
+; CHECK-NEXT:    zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT:    zip1 z4.s, z6.s, z5.s
+; CHECK-NEXT:    ldr q6, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT:    zip1 z3.s, z3.s, z16.s
+; CHECK-NEXT:    tbl z2.s, { z2.s }, z6.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z0.s
+; CHECK-NEXT:    zip1 z5.s, z7.s, z0.s
+; CHECK-NEXT:    zip1 z7.s, z0.s, z17.s
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z1.d, z5.d, z4.d
+; CHECK-NEXT:    zip1 z3.d, z7.d, z3.d
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT:    stp q1, q0, [x0]
-; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8f32:
@@ -2231,60 +2023,38 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z2.h, z1.h[6]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z4.h, z1.h[2]
-; CHECK-NEXT:    mov z6.h, z0.h[4]
-; CHECK-NEXT:    mov z3.h, z1.h[4]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z5.h, z0.h[6]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z3.h, z1.h[7]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z1.h[5]
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z5.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[1]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strh w9, [sp, #6]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[7]
-; CHECK-NEXT:    strh w9, [sp, #24]
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    mov z4.h, z0.h[5]
-; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w8, [sp, #20]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w8, [sp, #18]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    ldp q3, q0, [sp]
-; CHECK-NEXT:    add z0.h, z3.h, z0.h
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    mov z2.h, z0.h[6]
+; CHECK-NEXT:    mov z3.h, z0.h[4]
+; CHECK-NEXT:    mov z4.h, z0.h[2]
+; CHECK-NEXT:    mov z5.h, z1.h[6]
+; CHECK-NEXT:    mov z6.h, z1.h[4]
+; CHECK-NEXT:    mov z7.h, z1.h[2]
+; CHECK-NEXT:    mov z16.h, z0.h[7]
+; CHECK-NEXT:    mov z17.h, z0.h[5]
+; CHECK-NEXT:    mov z18.h, z0.h[3]
+; CHECK-NEXT:    mov z19.h, z0.h[1]
+; CHECK-NEXT:    mov z20.h, z1.h[7]
+; CHECK-NEXT:    mov z21.h, z1.h[5]
+; CHECK-NEXT:    mov z22.h, z1.h[3]
+; CHECK-NEXT:    mov z23.h, z1.h[1]
+; CHECK-NEXT:    zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z7.h
+; CHECK-NEXT:    zip1 z4.h, z17.h, z16.h
+; CHECK-NEXT:    zip1 z5.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z6.h, z21.h, z20.h
+; CHECK-NEXT:    zip1 z7.h, z23.h, z22.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8i16:
@@ -2341,31 +2111,21 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 define void @uzp_v8i32_undef(ptr %a) #0{
 ; CHECK-LABEL: uzp_v8i32_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.s, z0.s[2]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z3.s, z1.s[2]
-; CHECK-NEXT:    mov z4.s, z0.s[3]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.s, z1.s[3]
-; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    stp w8, w9, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    stp w9, w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    stp w9, w8, [sp, #16]
-; CHECK-NEXT:    ldp q0, q1, [sp]
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z2.s, z1.s[2]
+; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    mov z4.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[1]
+; CHECK-NEXT:    mov z6.s, z0.s[3]
+; CHECK-NEXT:    mov z7.s, z0.s[1]
+; CHECK-NEXT:    zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT:    zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    stp q0, q0, [x0]
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
index 88c83a214c7394..c942f1eca8ebaf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
@@ -10,22 +10,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.s, z0.s[3]
-; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z2.s, z0.s[2]
 ; CHECK-NEXT:    mov z3.s, z0.s[1]
-; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %el0 = extractelement <vscale x 4 x i1> %a, i32 0
   %el1 = extractelement <vscale x 4 x i1> %a, i32 1

>From b852c6152e3bdbc3a6dcf9ca1e600ebae0cd08b6 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 9 Oct 2024 18:17:10 +0000
Subject: [PATCH 2/4] Handle UNDEF better

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  15 +-
 .../sve-fixed-length-vector-shuffle-tbl.ll    |  27 +-
 ...sve-streaming-mode-fixed-length-bitcast.ll |   2 -
 ...-streaming-mode-fixed-length-fp-vselect.ll |   8 +-
 ...ing-mode-fixed-length-insert-vector-elt.ll |   2 -
 ...e-streaming-mode-fixed-length-int-to-fp.ll |  20 +-
 ...-streaming-mode-fixed-length-ld2-alloca.ll |   3 +-
 ...streaming-mode-fixed-length-masked-load.ll |   6 +-
 ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 260 +++++++++---------
 9 files changed, 162 insertions(+), 181 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6c1c33da4be996..71115705407bd6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14408,16 +14408,21 @@ SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
   SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
   SmallVector<SDValue, 16> Intermediates =
       llvm::map_to_vector<16>(Op->op_values(), [&](SDValue Op) {
-        return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
-                           DAG.getUNDEF(ZipVT), Op, ZeroI64);
+        SDValue Undef = DAG.getUNDEF(ZipVT);
+        return Op.isUndef() ? Undef
+                            : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
+                                          Undef, Op, ZeroI64);
       });
 
   while (Intermediates.size() > 1) {
     auto ToZipVT = [&](SDValue Op) { return DAG.getBitcast(ZipVT, Op); };
     for (unsigned I = 0; I < Intermediates.size(); I += 2) {
-      SDValue Op0 = ToZipVT(Intermediates[I + 0]);
-      SDValue Op1 = ToZipVT(Intermediates[I + 1]);
-      Intermediates[I / 2] = DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+      SDValue Op0 = Intermediates[I + 0];
+      SDValue Op1 = Intermediates[I + 1];
+      Intermediates[I / 2] = Op1.isUndef()
+                                 ? Op0
+                                 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT,
+                                               ToZipVT(Op0), ToZipVT(Op1));
     }
 
     Intermediates.resize(Intermediates.size() / 2);
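
[Editor's sketch, not part of the patch: the hunk above combines the per-operand vectors pairwise, keeping the first operand of a pair whenever the second is undef. A minimal standalone C++ model of that combine order follows; the operand names (t0, t1, t2) and the string stand-ins for SDValues are purely illustrative, and element sizes, which double each round in the real lowering, are not modelled.]

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical 4-operand BUILD_VECTOR; "undef" marks an undefined operand.
  std::vector<std::string> Work = {"t0", "t1", "t2", "undef"};

  // Mirror of the lowering loop above: combine adjacent pairs until a single
  // value remains. Strings only record which ZIP1 nodes the loop would emit.
  while (Work.size() > 1) {
    for (std::size_t I = 0; I < Work.size(); I += 2) {
      const std::string Op0 = Work[I], Op1 = Work[I + 1];
      // As in the patch, an undef second operand means no zip is needed.
      Work[I / 2] = (Op1 == "undef") ? Op0 : "zip1(" + Op0 + "," + Op1 + ")";
    }
    Work.resize(Work.size() / 2);
  }

  std::cout << Work[0] << "\n"; // prints: zip1(zip1(t0,t1),t2)
  return 0;
}
```

[Only the second operand of each pair is tested because pairs are taken in order, so an all-undef pair simply propagates the undef vector, and a trailing undef half never costs an extra ZIP1.]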
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index a22c00c1ebce14..20659cde83ee00 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -236,14 +236,13 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
 ; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
 ; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
 ; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[6]
 ; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
 ; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z5.b, z0.b
-; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
 ; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
 ; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
 ; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
@@ -256,14 +255,13 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[2]
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z0.b[1]
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[6]
 ; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
 ; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z5.b, z0.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
 ; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
 ; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
 ; SVE2_NOMIN_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_NOMIN_NOMAX-NEXT:    ret
@@ -276,14 +274,13 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
 ; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[2]
 ; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z0.b[1]
 ; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[6]
 ; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
 ; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z5.b, z0.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
 ; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
 ; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
 ; SVE2_MIN_256_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_MIN_256_NOMAX-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 172e2454d70283..6644be11a02ba7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -95,8 +95,6 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index e3c89981cb27af..ad5f91a5f39a49 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -9,13 +9,11 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
 ; CHECK-LABEL: select_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
-; CHECK-NEXT:    mov z4.s, z2.s[1]
-; CHECK-NEXT:    zip1 z3.h, z0.h, z0.h
+; CHECK-NEXT:    mov z3.s, z2.s[1]
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    zip1 z2.h, z2.h, z4.h
-; CHECK-NEXT:    zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    zip1 z2.h, z2.h, z3.h
 ; CHECK-NEXT:    lsl z2.h, z2.h, #15
 ; CHECK-NEXT:    asr z2.h, z2.h, #15
 ; CHECK-NEXT:    and z2.h, z2.h, #0x1
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index 87e3d0d09817ba..275d13ebfd9491 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -509,8 +509,6 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; CHECK-NEXT:    fmov h1, #5.00000000
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index f71bfb770b15f4..f9f70d30a757eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1161,14 +1161,12 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-LABEL: ucvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z0.d[1]
+; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    fmov x9, d2
+; CHECK-NEXT:    fmov x9, d1
 ; CHECK-NEXT:    ucvtf h0, x8
-; CHECK-NEXT:    ucvtf h2, x9
-; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ucvtf h1, x9
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -2617,14 +2615,12 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-LABEL: scvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z0.d[1]
+; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    fmov x9, d2
+; CHECK-NEXT:    fmov x9, d1
 ; CHECK-NEXT:    scvtf h0, x8
-; CHECK-NEXT:    scvtf h2, x9
-; CHECK-NEXT:    zip1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    scvtf h1, x9
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index ef6b1c9acbf105..613543310f2c31 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -70,8 +70,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    mov z2.b, z0.b[1]
 ; CHECK-NEXT:    mov z0.b, z0.b[5]
 ; CHECK-NEXT:    zip1 z1.h, z2.h, z1.h
-; CHECK-NEXT:    zip1 z2.h, z0.h, z0.h
-; CHECK-NEXT:    zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z0.s
 ; CHECK-NEXT:    st1b { z1.h }, p0, [x8]
 ; CHECK-NEXT:    ld1h { z1.s }, p1/z, [x8]
 ; CHECK-NEXT:    fmov w8, s0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 1b90aed22f9d8d..4980ee4d7f74b7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -2695,10 +2695,9 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    fmov s0, w2
 ; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    adrp x8, .LCPI13_0
-; CHECK-NEXT:    fmov s2, w3
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
-; CHECK-NEXT:    zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT:    fmov s1, w3
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
@@ -2769,10 +2768,9 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    fmov s0, w2
 ; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    adrp x8, .LCPI14_0
-; CHECK-NEXT:    fmov s2, w3
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
-; CHECK-NEXT:    zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT:    fmov s1, w3
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index 620e791c77e89f..8b296d9fbc215d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -1432,136 +1432,131 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str d14, [sp, #-64]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    stp d13, d12, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset b8, -8
 ; CHECK-NEXT:    .cfi_offset b9, -16
 ; CHECK-NEXT:    .cfi_offset b10, -24
 ; CHECK-NEXT:    .cfi_offset b11, -32
 ; CHECK-NEXT:    .cfi_offset b12, -40
 ; CHECK-NEXT:    .cfi_offset b13, -48
-; CHECK-NEXT:    .cfi_offset b14, -64
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    mov z2.b, z1.b[14]
 ; CHECK-NEXT:    mov z3.b, z1.b[12]
 ; CHECK-NEXT:    mov z4.b, z1.b[10]
 ; CHECK-NEXT:    mov z5.b, z1.b[8]
-; CHECK-NEXT:    mov z7.b, z1.b[6]
-; CHECK-NEXT:    mov z16.b, z1.b[4]
-; CHECK-NEXT:    mov z17.b, z1.b[2]
+; CHECK-NEXT:    mov z6.b, z1.b[6]
+; CHECK-NEXT:    mov z7.b, z1.b[4]
+; CHECK-NEXT:    mov z16.b, z1.b[2]
 ; CHECK-NEXT:    mov z18.b, z0.b[14]
-; CHECK-NEXT:    mov z20.b, z0.b[12]
+; CHECK-NEXT:    mov z19.b, z0.b[12]
 ; CHECK-NEXT:    zip1 z3.b, z3.b, z2.b
-; CHECK-NEXT:    ldp q2, q19, [x1]
-; CHECK-NEXT:    zip1 z6.b, z5.b, z4.b
-; CHECK-NEXT:    zip1 z4.b, z16.b, z7.b
-; CHECK-NEXT:    mov z16.b, z0.b[10]
-; CHECK-NEXT:    zip1 z5.b, z1.b, z17.b
-; CHECK-NEXT:    zip1 z7.b, z20.b, z18.b
-; CHECK-NEXT:    mov z17.b, z0.b[8]
-; CHECK-NEXT:    mov z18.b, z0.b[6]
-; CHECK-NEXT:    mov z20.b, z0.b[4]
-; CHECK-NEXT:    mov z21.b, z0.b[2]
-; CHECK-NEXT:    mov z22.b, z19.b[14]
-; CHECK-NEXT:    mov z23.b, z19.b[12]
-; CHECK-NEXT:    mov z24.b, z19.b[10]
-; CHECK-NEXT:    mov z25.b, z19.b[8]
-; CHECK-NEXT:    mov z26.b, z19.b[6]
-; CHECK-NEXT:    mov z27.b, z19.b[4]
-; CHECK-NEXT:    mov z28.b, z19.b[2]
-; CHECK-NEXT:    mov z29.b, z2.b[14]
-; CHECK-NEXT:    mov z30.b, z2.b[12]
-; CHECK-NEXT:    mov z31.b, z2.b[10]
-; CHECK-NEXT:    mov z8.b, z2.b[8]
-; CHECK-NEXT:    zip1 z16.b, z17.b, z16.b
-; CHECK-NEXT:    zip1 z17.b, z20.b, z18.b
-; CHECK-NEXT:    zip1 z18.b, z0.b, z21.b
-; CHECK-NEXT:    zip1 z20.b, z23.b, z22.b
-; CHECK-NEXT:    zip1 z21.b, z25.b, z24.b
-; CHECK-NEXT:    zip1 z22.b, z27.b, z26.b
-; CHECK-NEXT:    zip1 z23.b, z19.b, z28.b
-; CHECK-NEXT:    zip1 z24.b, z30.b, z29.b
-; CHECK-NEXT:    zip1 z25.b, z8.b, z31.b
-; CHECK-NEXT:    zip1 z3.h, z6.h, z3.h
-; CHECK-NEXT:    zip1 z4.h, z5.h, z4.h
-; CHECK-NEXT:    zip1 z5.h, z16.h, z7.h
-; CHECK-NEXT:    zip1 z7.h, z18.h, z17.h
-; CHECK-NEXT:    zip1 z16.h, z21.h, z20.h
-; CHECK-NEXT:    zip1 z17.h, z23.h, z22.h
-; CHECK-NEXT:    mov z21.b, z19.b[15]
-; CHECK-NEXT:    zip1 z6.h, z25.h, z24.h
-; CHECK-NEXT:    mov z22.b, z19.b[13]
-; CHECK-NEXT:    mov z23.b, z19.b[11]
-; CHECK-NEXT:    mov z24.b, z19.b[9]
-; CHECK-NEXT:    mov z26.b, z2.b[6]
-; CHECK-NEXT:    mov z27.b, z2.b[4]
-; CHECK-NEXT:    mov z20.b, z2.b[2]
-; CHECK-NEXT:    mov z25.b, z19.b[7]
-; CHECK-NEXT:    mov z28.b, z19.b[1]
-; CHECK-NEXT:    zip1 z21.b, z22.b, z21.b
-; CHECK-NEXT:    mov z29.b, z2.b[15]
-; CHECK-NEXT:    mov z30.b, z2.b[13]
-; CHECK-NEXT:    zip1 z22.b, z24.b, z23.b
-; CHECK-NEXT:    mov z23.b, z1.b[15]
-; CHECK-NEXT:    mov z24.b, z1.b[13]
-; CHECK-NEXT:    zip1 z18.b, z27.b, z26.b
-; CHECK-NEXT:    mov z26.b, z19.b[5]
-; CHECK-NEXT:    mov z27.b, z19.b[3]
-; CHECK-NEXT:    mov z31.b, z1.b[9]
-; CHECK-NEXT:    zip1 z20.b, z2.b, z20.b
-; CHECK-NEXT:    mov z8.b, z1.b[7]
+; CHECK-NEXT:    ldp q2, q17, [x1]
+; CHECK-NEXT:    mov z20.b, z0.b[10]
+; CHECK-NEXT:    zip1 z4.b, z5.b, z4.b
+; CHECK-NEXT:    zip1 z5.b, z7.b, z6.b
+; CHECK-NEXT:    zip1 z6.b, z1.b, z16.b
+; CHECK-NEXT:    mov z7.b, z0.b[8]
+; CHECK-NEXT:    mov z16.b, z0.b[6]
+; CHECK-NEXT:    mov z21.b, z0.b[4]
+; CHECK-NEXT:    mov z22.b, z0.b[2]
+; CHECK-NEXT:    mov z23.b, z17.b[14]
+; CHECK-NEXT:    mov z24.b, z17.b[12]
+; CHECK-NEXT:    mov z25.b, z17.b[10]
+; CHECK-NEXT:    mov z26.b, z17.b[8]
+; CHECK-NEXT:    mov z27.b, z17.b[6]
+; CHECK-NEXT:    mov z28.b, z17.b[4]
+; CHECK-NEXT:    mov z29.b, z17.b[2]
+; CHECK-NEXT:    zip1 z18.b, z19.b, z18.b
+; CHECK-NEXT:    zip1 z7.b, z7.b, z20.b
+; CHECK-NEXT:    zip1 z16.b, z21.b, z16.b
+; CHECK-NEXT:    zip1 z19.b, z0.b, z22.b
+; CHECK-NEXT:    zip1 z20.b, z24.b, z23.b
+; CHECK-NEXT:    zip1 z21.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z22.b, z28.b, z27.b
+; CHECK-NEXT:    mov z24.b, z2.b[14]
+; CHECK-NEXT:    mov z25.b, z2.b[12]
+; CHECK-NEXT:    mov z26.b, z2.b[10]
+; CHECK-NEXT:    mov z27.b, z2.b[8]
+; CHECK-NEXT:    zip1 z23.b, z17.b, z29.b
+; CHECK-NEXT:    zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT:    zip1 z5.h, z7.h, z18.h
+; CHECK-NEXT:    zip1 z6.h, z19.h, z16.h
+; CHECK-NEXT:    zip1 z7.h, z21.h, z20.h
+; CHECK-NEXT:    zip1 z18.b, z25.b, z24.b
+; CHECK-NEXT:    zip1 z19.b, z27.b, z26.b
+; CHECK-NEXT:    mov z20.b, z2.b[6]
+; CHECK-NEXT:    mov z21.b, z2.b[4]
+; CHECK-NEXT:    mov z29.b, z17.b[3]
+; CHECK-NEXT:    mov z30.b, z17.b[1]
+; CHECK-NEXT:    mov z31.b, z2.b[15]
+; CHECK-NEXT:    mov z8.b, z2.b[13]
+; CHECK-NEXT:    zip1 z16.h, z23.h, z22.h
+; CHECK-NEXT:    mov z22.b, z2.b[2]
+; CHECK-NEXT:    mov z23.b, z17.b[15]
+; CHECK-NEXT:    mov z24.b, z17.b[13]
+; CHECK-NEXT:    mov z25.b, z17.b[11]
+; CHECK-NEXT:    mov z26.b, z17.b[9]
+; CHECK-NEXT:    mov z27.b, z17.b[7]
+; CHECK-NEXT:    mov z28.b, z17.b[5]
+; CHECK-NEXT:    zip1 z17.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z21.b, z21.b, z20.b
+; CHECK-NEXT:    zip1 z19.b, z30.b, z29.b
+; CHECK-NEXT:    zip1 z20.b, z8.b, z31.b
+; CHECK-NEXT:    mov z29.b, z1.b[15]
+; CHECK-NEXT:    mov z30.b, z1.b[13]
+; CHECK-NEXT:    mov z31.b, z1.b[11]
+; CHECK-NEXT:    mov z8.b, z1.b[9]
+; CHECK-NEXT:    zip1 z22.b, z2.b, z22.b
 ; CHECK-NEXT:    zip1 z23.b, z24.b, z23.b
-; CHECK-NEXT:    mov z24.b, z1.b[11]
-; CHECK-NEXT:    mov z9.b, z1.b[5]
-; CHECK-NEXT:    zip1 z19.b, z26.b, z25.b
-; CHECK-NEXT:    zip1 z25.b, z28.b, z27.b
-; CHECK-NEXT:    zip1 z26.b, z30.b, z29.b
-; CHECK-NEXT:    mov z27.b, z2.b[11]
-; CHECK-NEXT:    mov z28.b, z2.b[9]
-; CHECK-NEXT:    mov z29.b, z2.b[7]
-; CHECK-NEXT:    mov z30.b, z2.b[5]
-; CHECK-NEXT:    mov z10.b, z0.b[11]
-; CHECK-NEXT:    mov z11.b, z0.b[9]
-; CHECK-NEXT:    mov z12.b, z0.b[3]
-; CHECK-NEXT:    mov z13.b, z0.b[1]
+; CHECK-NEXT:    zip1 z24.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z18.b, z28.b, z27.b
+; CHECK-NEXT:    mov z25.b, z2.b[11]
+; CHECK-NEXT:    mov z26.b, z2.b[9]
+; CHECK-NEXT:    mov z27.b, z2.b[7]
+; CHECK-NEXT:    mov z28.b, z2.b[5]
+; CHECK-NEXT:    mov z9.b, z1.b[7]
+; CHECK-NEXT:    mov z10.b, z1.b[5]
 ; CHECK-NEXT:    mov z1.b, z1.b[3]
-; CHECK-NEXT:    mov z14.b, z0.b[13]
-; CHECK-NEXT:    mov z0.b, z0.b[5]
-; CHECK-NEXT:    zip1 z24.b, z31.b, z24.b
-; CHECK-NEXT:    mov z31.b, z2.b[3]
+; CHECK-NEXT:    mov z11.b, z0.b[11]
+; CHECK-NEXT:    mov z12.b, z0.b[9]
+; CHECK-NEXT:    zip1 z29.b, z30.b, z29.b
+; CHECK-NEXT:    mov z30.b, z0.b[3]
+; CHECK-NEXT:    mov z13.b, z0.b[1]
+; CHECK-NEXT:    zip1 z31.b, z8.b, z31.b
+; CHECK-NEXT:    mov z8.b, z2.b[3]
 ; CHECK-NEXT:    mov z2.b, z2.b[1]
-; CHECK-NEXT:    zip1 z8.b, z9.b, z8.b
-; CHECK-NEXT:    zip1 z9.b, z11.b, z10.b
-; CHECK-NEXT:    zip1 z10.b, z13.b, z12.b
-; CHECK-NEXT:    zip1 z27.b, z28.b, z27.b
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z9.b, z10.b, z9.b
+; CHECK-NEXT:    zip1 z10.b, z12.b, z11.b
 ; CHECK-NEXT:    zip1 z1.b, z0.b, z1.b
-; CHECK-NEXT:    zip1 z11.b, z14.b, z0.b
-; CHECK-NEXT:    zip1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    zip1 z28.b, z30.b, z29.b
-; CHECK-NEXT:    zip1 z2.b, z2.b, z31.b
-; CHECK-NEXT:    zip1 z18.h, z20.h, z18.h
-; CHECK-NEXT:    zip1 z20.h, z22.h, z21.h
-; CHECK-NEXT:    zip1 z21.h, z24.h, z23.h
-; CHECK-NEXT:    zip1 z1.h, z1.h, z8.h
-; CHECK-NEXT:    zip1 z19.h, z25.h, z19.h
-; CHECK-NEXT:    zip1 z22.h, z9.h, z11.h
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    zip1 z0.h, z10.h, z0.h
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    zip1 z23.h, z27.h, z26.h
-; CHECK-NEXT:    zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT:    zip1 z30.b, z13.b, z30.b
+; CHECK-NEXT:    mov z11.b, z0.b[13]
+; CHECK-NEXT:    mov z0.b, z0.b[5]
+; CHECK-NEXT:    zip1 z25.b, z26.b, z25.b
+; CHECK-NEXT:    zip1 z26.b, z28.b, z27.b
+; CHECK-NEXT:    zip1 z2.b, z2.b, z8.b
+; CHECK-NEXT:    zip1 z21.h, z22.h, z21.h
+; CHECK-NEXT:    zip1 z22.h, z24.h, z23.h
+; CHECK-NEXT:    zip1 z23.h, z31.h, z29.h
+; CHECK-NEXT:    zip1 z1.h, z1.h, z9.h
+; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z24.h, z10.h, z11.h
+; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 z0.h, z30.h, z0.h
+; CHECK-NEXT:    zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT:    zip1 z19.h, z25.h, z20.h
+; CHECK-NEXT:    zip1 z2.h, z2.h, z26.h
 ; CHECK-NEXT:    zip1 z3.s, z4.s, z3.s
-; CHECK-NEXT:    zip1 z4.s, z7.s, z5.s
-; CHECK-NEXT:    zip1 z5.s, z17.s, z16.s
-; CHECK-NEXT:    zip1 z1.s, z1.s, z21.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z22.s
-; CHECK-NEXT:    zip1 z6.s, z18.s, z6.s
-; CHECK-NEXT:    zip1 z7.s, z19.s, z20.s
-; CHECK-NEXT:    zip1 z2.s, z2.s, z23.s
+; CHECK-NEXT:    zip1 z4.s, z6.s, z5.s
+; CHECK-NEXT:    zip1 z5.s, z16.s, z7.s
+; CHECK-NEXT:    zip1 z1.s, z1.s, z23.s
+; CHECK-NEXT:    zip1 z6.s, z21.s, z17.s
+; CHECK-NEXT:    zip1 z0.s, z0.s, z24.s
+; CHECK-NEXT:    zip1 z7.s, z18.s, z22.s
+; CHECK-NEXT:    zip1 z2.s, z2.s, z19.s
 ; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z6.d, z5.d
@@ -1569,7 +1564,7 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    add z0.b, z3.b, z0.b
 ; CHECK-NEXT:    add z1.b, z1.b, z2.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    ldr d14, [sp], #64 // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp], #48 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uzp_v32i8:
@@ -1909,29 +1904,26 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q6, q0, [x0]
 ; CHECK-NEXT:    adrp x8, .LCPI21_0
-; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    mov z4.s, z0.s[2]
-; CHECK-NEXT:    mov z5.s, z0.s[3]
-; CHECK-NEXT:    mov z6.s, z0.s[1]
-; CHECK-NEXT:    mov z7.s, z1.s[1]
-; CHECK-NEXT:    mov z16.s, z3.s[2]
-; CHECK-NEXT:    mov z17.s, z2.s[2]
-; CHECK-NEXT:    zip1 z0.s, z0.s, z4.s
-; CHECK-NEXT:    zip1 z4.s, z6.s, z5.s
-; CHECK-NEXT:    ldr q6, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT:    zip1 z3.s, z3.s, z16.s
-; CHECK-NEXT:    tbl z2.s, { z2.s }, z6.s
-; CHECK-NEXT:    zip1 z1.s, z1.s, z0.s
-; CHECK-NEXT:    zip1 z5.s, z7.s, z0.s
-; CHECK-NEXT:    zip1 z7.s, z0.s, z17.s
-; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    zip1 z1.d, z5.d, z4.d
-; CHECK-NEXT:    zip1 z3.d, z7.d, z3.d
-; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    mov z4.s, z0.s[3]
+; CHECK-NEXT:    mov z5.s, z0.s[1]
+; CHECK-NEXT:    mov z7.s, z2.s[2]
+; CHECK-NEXT:    mov z16.s, z1.s[2]
+; CHECK-NEXT:    zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT:    zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT:    mov z4.s, z6.s[1]
+; CHECK-NEXT:    zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT:    zip1 z7.s, z0.s, z16.s
+; CHECK-NEXT:    tbl z1.s, { z1.s }, z5.s
+; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z3.s
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

>From c7a6b91b705d688f772fff7dac4ff3153576d4d1 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 10 Oct 2024 13:00:32 +0000
Subject: [PATCH 3/4] Fixups and limit int types
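
This fixup restricts the new ZIP1 lowering for integer BUILD_VECTORs to
cases with at most four elements that are not already
EXTRACT_VECTOR_ELTs, to avoid emitting long chains of fmovs from GPRs.
As a rough sketch of the effect (the node names and the v8i8 type below
are illustrative, not taken from a real DAG dump), a vector built
entirely from scalar GPR values such as

```
t8: v8i8 = BUILD_VECTOR(t0, t1, t2, t3, t4, t5, t6, t7)
```

now keeps the default stack-based lowering (see the updated
build_vector_non_const_v8i8 test), while floating-point vectors and
small integer cases continue to use the ZIP1 chain.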

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  45 +++---
 ...treaming-mode-fixed-length-build-vector.ll |  29 ++--
 ...streaming-mode-fixed-length-masked-load.ll | 134 +++++++----------
 ...treaming-mode-fixed-length-masked-store.ll | 140 +++++++-----------
 4 files changed, 149 insertions(+), 199 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 71115705407bd6..bb2a7587849c59 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14399,39 +14399,44 @@ SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
     return convertFromScalableVector(DAG, VT, Seq);
   }
 
+  unsigned NumElems = VT.getVectorNumElements();
   if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
-      VT.getVectorNumElements() <= 1 || BVN->isConstant())
+      NumElems <= 1 || BVN->isConstant())
+    return SDValue();
+
+  auto IsExtractElt = [](SDValue Op) {
+    return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+  };
+
+  // For integer types that are not already in vectors limit to at most four
+  // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
+  if (VT.getScalarType().isInteger() &&
+      NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
     return SDValue();
 
   // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
-  EVT ZipVT = ContainerVT;
   SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
-  SmallVector<SDValue, 16> Intermediates =
-      llvm::map_to_vector<16>(Op->op_values(), [&](SDValue Op) {
-        SDValue Undef = DAG.getUNDEF(ZipVT);
+  SmallVector<SDValue, 16> Intermediates = llvm::map_to_vector<16>(
+      Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
         return Op.isUndef() ? Undef
-                            : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
-                                          Undef, Op, ZeroI64);
+                            : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                                          ContainerVT, Undef, Op, ZeroI64);
       });
 
+  ElementCount ZipEC = ContainerVT.getVectorElementCount();
   while (Intermediates.size() > 1) {
-    auto ToZipVT = [&](SDValue Op) { return DAG.getBitcast(ZipVT, Op); };
+    EVT ZipVT = getPackedSVEVectorVT(ZipEC);
+
     for (unsigned I = 0; I < Intermediates.size(); I += 2) {
-      SDValue Op0 = Intermediates[I + 0];
-      SDValue Op1 = Intermediates[I + 1];
-      Intermediates[I / 2] = Op1.isUndef()
-                                 ? Op0
-                                 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT,
-                                               ToZipVT(Op0), ToZipVT(Op1));
+      SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
+      SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
+      Intermediates[I / 2] =
+          Op1.isUndef() ? Op0
+                        : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
     }
 
     Intermediates.resize(Intermediates.size() / 2);
-    if (Intermediates.size() > 1) {
-      // Prefer FP values to keep elements within vector registers (and also as
-      // f16 is conveniently a legal type).
-      ZipVT = getPackedSVEVectorVT(EVT::getFloatingPointVT(
-          ZipVT.getVectorElementType().getSizeInBits() * 2));
-    }
+    ZipEC = ZipEC.divideCoefficientBy(2);
   }
 
   assert(Intermediates.size() == 1);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index 9166dcbf62c4ef..9729a1d95cd916 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -430,23 +430,20 @@ define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) {
 define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) {
 ; CHECK-LABEL: build_vector_non_const_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w7
-; CHECK-NEXT:    fmov s1, w6
-; CHECK-NEXT:    ldr x8, [sp]
-; CHECK-NEXT:    fmov s2, w4
-; CHECK-NEXT:    fmov s3, w3
-; CHECK-NEXT:    fmov s4, w2
-; CHECK-NEXT:    fmov s5, w1
-; CHECK-NEXT:    fmov s6, w0
-; CHECK-NEXT:    zip1 z0.b, z1.b, z0.b
-; CHECK-NEXT:    fmov s1, w5
-; CHECK-NEXT:    zip1 z1.b, z2.b, z1.b
-; CHECK-NEXT:    zip1 z2.b, z4.b, z3.b
-; CHECK-NEXT:    zip1 z3.b, z6.b, z5.b
-; CHECK-NEXT:    zip1 z0.h, z1.h, z0.h
-; CHECK-NEXT:    zip1 z1.h, z3.h, z2.h
-; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    strb w7, [sp, #15]
+; CHECK-NEXT:    ldr x8, [sp, #16]
+; CHECK-NEXT:    strb w6, [sp, #14]
+; CHECK-NEXT:    strb w5, [sp, #13]
+; CHECK-NEXT:    strb w4, [sp, #12]
+; CHECK-NEXT:    strb w3, [sp, #11]
+; CHECK-NEXT:    strb w2, [sp, #10]
+; CHECK-NEXT:    strb w1, [sp, #9]
+; CHECK-NEXT:    strb w0, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 4980ee4d7f74b7..9055b2efba3282 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -676,105 +676,79 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v32i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ldr w8, [sp, #224]
+; CHECK-NEXT:    ldr w9, [sp, #216]
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    strb w7, [sp, #6]
+; CHECK-NEXT:    strb w8, [sp, #31]
+; CHECK-NEXT:    ldr w8, [sp, #208]
+; CHECK-NEXT:    strb w9, [sp, #30]
+; CHECK-NEXT:    ldr w9, [sp, #200]
+; CHECK-NEXT:    strb w8, [sp, #29]
 ; CHECK-NEXT:    ldr w8, [sp, #192]
+; CHECK-NEXT:    strb w9, [sp, #28]
 ; CHECK-NEXT:    ldr w9, [sp, #184]
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldr w10, [sp, #160]
-; CHECK-NEXT:    ldr w11, [sp, #144]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    strb w8, [sp, #27]
 ; CHECK-NEXT:    ldr w8, [sp, #176]
-; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    strb w9, [sp, #26]
 ; CHECK-NEXT:    ldr w9, [sp, #168]
-; CHECK-NEXT:    fmov s3, w10
-; CHECK-NEXT:    fmov s4, w11
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    ldr w8, [sp, #152]
-; CHECK-NEXT:    ldr w10, [sp, #112]
-; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    strb w8, [sp, #25]
+; CHECK-NEXT:    ldr w8, [sp, #160]
+; CHECK-NEXT:    strb w9, [sp, #24]
+; CHECK-NEXT:    ldr w9, [sp, #152]
+; CHECK-NEXT:    strb w8, [sp, #23]
+; CHECK-NEXT:    ldr w8, [sp, #144]
+; CHECK-NEXT:    strb w9, [sp, #22]
 ; CHECK-NEXT:    ldr w9, [sp, #136]
-; CHECK-NEXT:    ldr w11, [sp, #96]
-; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    strb w8, [sp, #21]
 ; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    zip1 z0.b, z1.b, z0.b
-; CHECK-NEXT:    fmov s7, w9
+; CHECK-NEXT:    strb w9, [sp, #20]
 ; CHECK-NEXT:    ldr w9, [sp, #120]
-; CHECK-NEXT:    fmov s18, w10
-; CHECK-NEXT:    fmov s16, w8
-; CHECK-NEXT:    ldr w8, [sp, #104]
-; CHECK-NEXT:    zip1 z2.b, z6.b, z2.b
-; CHECK-NEXT:    fmov s17, w9
+; CHECK-NEXT:    strb w8, [sp, #19]
+; CHECK-NEXT:    ldr w8, [sp, #112]
+; CHECK-NEXT:    strb w9, [sp, #18]
+; CHECK-NEXT:    ldr w9, [sp, #104]
+; CHECK-NEXT:    strb w8, [sp, #17]
+; CHECK-NEXT:    ldr w8, [sp, #96]
+; CHECK-NEXT:    strb w9, [sp, #16]
 ; CHECK-NEXT:    ldr w9, [sp, #88]
-; CHECK-NEXT:    fmov s20, w11
-; CHECK-NEXT:    fmov s19, w8
+; CHECK-NEXT:    strb w8, [sp, #15]
 ; CHECK-NEXT:    ldr w8, [sp, #80]
-; CHECK-NEXT:    ldr w10, [sp, #64]
-; CHECK-NEXT:    fmov s21, w9
+; CHECK-NEXT:    strb w9, [sp, #14]
 ; CHECK-NEXT:    ldr w9, [sp, #72]
-; CHECK-NEXT:    ldr w11, [sp, #48]
-; CHECK-NEXT:    fmov s22, w8
-; CHECK-NEXT:    ldr w8, [sp, #56]
-; CHECK-NEXT:    zip1 z3.b, z5.b, z3.b
-; CHECK-NEXT:    fmov s23, w9
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    ldr w8, [sp, #64]
+; CHECK-NEXT:    strb w9, [sp, #12]
+; CHECK-NEXT:    ldr w9, [sp, #56]
+; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    ldr w8, [sp, #48]
+; CHECK-NEXT:    strb w9, [sp, #10]
 ; CHECK-NEXT:    ldr w9, [sp, #40]
-; CHECK-NEXT:    zip1 z4.b, z7.b, z4.b
-; CHECK-NEXT:    fmov s25, w8
+; CHECK-NEXT:    strb w8, [sp, #9]
 ; CHECK-NEXT:    ldr w8, [sp, #32]
-; CHECK-NEXT:    fmov s24, w10
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    ldr w9, [sp, #24]
-; CHECK-NEXT:    fmov s26, w11
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    ldr w8, [sp, #16]
-; CHECK-NEXT:    zip1 z16.b, z17.b, z16.b
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    ldr w9, [sp, #8]
-; CHECK-NEXT:    zip1 z17.b, z19.b, z18.b
-; CHECK-NEXT:    fmov s7, w8
-; CHECK-NEXT:    ldr w8, [sp]
-; CHECK-NEXT:    zip1 z19.b, z21.b, z20.b
-; CHECK-NEXT:    fmov s18, w9
-; CHECK-NEXT:    zip1 z20.b, z23.b, z22.b
-; CHECK-NEXT:    fmov s23, w7
-; CHECK-NEXT:    fmov s22, w8
-; CHECK-NEXT:    zip1 z21.b, z25.b, z24.b
-; CHECK-NEXT:    zip1 z1.b, z1.b, z26.b
-; CHECK-NEXT:    zip1 z5.b, z5.b, z6.b
-; CHECK-NEXT:    fmov s24, w3
-; CHECK-NEXT:    fmov s25, w2
-; CHECK-NEXT:    zip1 z6.b, z18.b, z7.b
-; CHECK-NEXT:    fmov s18, w6
-; CHECK-NEXT:    fmov s26, w1
-; CHECK-NEXT:    zip1 z7.b, z23.b, z22.b
-; CHECK-NEXT:    fmov s22, w5
-; CHECK-NEXT:    fmov s23, w4
-; CHECK-NEXT:    zip1 z0.h, z2.h, z0.h
-; CHECK-NEXT:    zip1 z2.h, z4.h, z3.h
-; CHECK-NEXT:    zip1 z3.h, z17.h, z16.h
-; CHECK-NEXT:    zip1 z4.h, z20.h, z19.h
-; CHECK-NEXT:    zip1 z1.h, z1.h, z21.h
-; CHECK-NEXT:    zip1 z5.h, z6.h, z5.h
-; CHECK-NEXT:    zip1 z18.b, z22.b, z18.b
-; CHECK-NEXT:    zip1 z22.b, z24.b, z23.b
+; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    strb w8, [sp, #7]
 ; CHECK-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEXT:    zip1 z23.b, z26.b, z25.b
-; CHECK-NEXT:    zip1 z0.s, z2.s, z0.s
-; CHECK-NEXT:    zip1 z2.s, z4.s, z3.s
-; CHECK-NEXT:    zip1 z1.s, z5.s, z1.s
-; CHECK-NEXT:    zip1 z6.h, z18.h, z7.h
-; CHECK-NEXT:    zip1 z7.h, z23.h, z22.h
-; CHECK-NEXT:    zip1 z0.d, z2.d, z0.d
-; CHECK-NEXT:    zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT:    strb w6, [sp, #5]
+; CHECK-NEXT:    strb w5, [sp, #4]
+; CHECK-NEXT:    strb w4, [sp, #3]
+; CHECK-NEXT:    strb w3, [sp, #2]
+; CHECK-NEXT:    strb w2, [sp, #1]
+; CHECK-NEXT:    strb w1, [sp]
+; CHECK-NEXT:    ldp q1, q0, [sp]
 ; CHECK-NEXT:    lsl z0.b, z0.b, #7
-; CHECK-NEXT:    zip1 z1.d, z3.d, z1.d
-; CHECK-NEXT:    asr z0.b, z0.b, #7
 ; CHECK-NEXT:    lsl z1.b, z1.b, #7
-; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT:    asr z0.b, z0.b, #7
 ; CHECK-NEXT:    asr z1.b, z1.b, #7
+; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, x8]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v32i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 2966ab12b8cad6..265480b571970f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -293,104 +293,78 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v32i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ldr w8, [sp, #96]
+; CHECK-NEXT:    ldr w9, [sp, #88]
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldr w10, [sp, #120]
+; CHECK-NEXT:    strb w7, [sp, #6]
+; CHECK-NEXT:    strb w8, [sp, #15]
+; CHECK-NEXT:    ldr w8, [sp, #80]
+; CHECK-NEXT:    strb w9, [sp, #14]
+; CHECK-NEXT:    ldr w9, [sp, #72]
+; CHECK-NEXT:    strb w8, [sp, #13]
 ; CHECK-NEXT:    ldr w8, [sp, #64]
+; CHECK-NEXT:    strb w9, [sp, #12]
 ; CHECK-NEXT:    ldr w9, [sp, #56]
-; CHECK-NEXT:    fmov s26, w2
-; CHECK-NEXT:    ldr w10, [sp, #32]
-; CHECK-NEXT:    ldr w11, [sp, #16]
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    strb w8, [sp, #11]
 ; CHECK-NEXT:    ldr w8, [sp, #48]
-; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    strb w9, [sp, #10]
 ; CHECK-NEXT:    ldr w9, [sp, #40]
-; CHECK-NEXT:    fmov s5, w10
-; CHECK-NEXT:    fmov s7, w11
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    ldr w8, [sp, #24]
-; CHECK-NEXT:    ldr w10, [sp, #176]
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    ldr w9, [sp, #8]
-; CHECK-NEXT:    ldr w11, [sp, #168]
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    ldr w8, [sp]
-; CHECK-NEXT:    fmov s19, w10
-; CHECK-NEXT:    fmov s16, w9
+; CHECK-NEXT:    strb w8, [sp, #9]
+; CHECK-NEXT:    ldr w8, [sp, #32]
+; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    ldr w9, [sp, #216]
+; CHECK-NEXT:    strb w8, [sp, #7]
+; CHECK-NEXT:    ldr w8, [sp, #224]
+; CHECK-NEXT:    strb w9, [sp, #30]
+; CHECK-NEXT:    ldr w9, [sp, #200]
+; CHECK-NEXT:    strb w8, [sp, #31]
+; CHECK-NEXT:    ldr w8, [sp, #208]
+; CHECK-NEXT:    strb w9, [sp, #28]
 ; CHECK-NEXT:    ldr w9, [sp, #184]
-; CHECK-NEXT:    fmov s20, w11
-; CHECK-NEXT:    zip1 z4.b, z3.b, z2.b
-; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    strb w8, [sp, #29]
 ; CHECK-NEXT:    ldr w8, [sp, #192]
-; CHECK-NEXT:    fmov s18, w9
+; CHECK-NEXT:    strb w9, [sp, #26]
+; CHECK-NEXT:    ldr w9, [sp, #168]
+; CHECK-NEXT:    strb w8, [sp, #27]
+; CHECK-NEXT:    ldr w8, [sp, #176]
+; CHECK-NEXT:    strb w9, [sp, #24]
 ; CHECK-NEXT:    ldr w9, [sp, #152]
-; CHECK-NEXT:    ldr w10, [sp, #136]
-; CHECK-NEXT:    fmov s17, w8
+; CHECK-NEXT:    strb w8, [sp, #25]
 ; CHECK-NEXT:    ldr w8, [sp, #160]
-; CHECK-NEXT:    ldr w11, [sp, #120]
-; CHECK-NEXT:    fmov s21, w10
-; CHECK-NEXT:    ldr w10, [sp, #88]
-; CHECK-NEXT:    zip1 z1.b, z1.b, z0.b
-; CHECK-NEXT:    fmov s23, w11
-; CHECK-NEXT:    ldr w11, [sp, #72]
-; CHECK-NEXT:    zip1 z0.b, z6.b, z5.b
-; CHECK-NEXT:    zip1 z17.b, z18.b, z17.b
-; CHECK-NEXT:    zip1 z18.b, z20.b, z19.b
-; CHECK-NEXT:    fmov s19, w8
-; CHECK-NEXT:    fmov s20, w9
+; CHECK-NEXT:    strb w9, [sp, #22]
+; CHECK-NEXT:    ldr w9, [sp, #136]
+; CHECK-NEXT:    strb w8, [sp, #23]
 ; CHECK-NEXT:    ldr w8, [sp, #144]
-; CHECK-NEXT:    ldr w9, [sp, #128]
-; CHECK-NEXT:    fmov s24, w10
-; CHECK-NEXT:    fmov s5, w7
-; CHECK-NEXT:    fmov s25, w11
-; CHECK-NEXT:    fmov s22, w9
-; CHECK-NEXT:    ldr w9, [sp, #104]
-; CHECK-NEXT:    zip1 z2.b, z16.b, z7.b
-; CHECK-NEXT:    zip1 z19.b, z20.b, z19.b
-; CHECK-NEXT:    fmov s20, w8
-; CHECK-NEXT:    ldr w8, [sp, #112]
-; CHECK-NEXT:    zip1 z3.b, z5.b, z3.b
-; CHECK-NEXT:    fmov s5, w6
-; CHECK-NEXT:    fmov s6, w5
-; CHECK-NEXT:    fmov s7, w4
-; CHECK-NEXT:    fmov s16, w3
-; CHECK-NEXT:    zip1 z1.h, z4.h, z1.h
-; CHECK-NEXT:    zip1 z20.b, z21.b, z20.b
-; CHECK-NEXT:    zip1 z21.b, z23.b, z22.b
-; CHECK-NEXT:    fmov s22, w8
-; CHECK-NEXT:    fmov s23, w9
-; CHECK-NEXT:    ldr w8, [sp, #96]
-; CHECK-NEXT:    ldr w9, [sp, #80]
-; CHECK-NEXT:    zip1 z5.b, z6.b, z5.b
-; CHECK-NEXT:    zip1 z6.b, z16.b, z7.b
-; CHECK-NEXT:    zip1 z4.h, z18.h, z17.h
-; CHECK-NEXT:    zip1 z16.h, z20.h, z19.h
-; CHECK-NEXT:    zip1 z0.h, z2.h, z0.h
-; CHECK-NEXT:    zip1 z22.b, z23.b, z22.b
-; CHECK-NEXT:    fmov s23, w8
+; CHECK-NEXT:    strb w9, [sp, #20]
+; CHECK-NEXT:    ldr w9, [sp, #112]
+; CHECK-NEXT:    strb w8, [sp, #21]
+; CHECK-NEXT:    ldr w8, [sp, #128]
+; CHECK-NEXT:    strb w6, [sp, #5]
+; CHECK-NEXT:    strb w8, [sp, #19]
+; CHECK-NEXT:    ldr w8, [sp, #104]
+; CHECK-NEXT:    strb w5, [sp, #4]
+; CHECK-NEXT:    strb w4, [sp, #3]
+; CHECK-NEXT:    strb w3, [sp, #2]
+; CHECK-NEXT:    strb w2, [sp, #1]
+; CHECK-NEXT:    strb w1, [sp]
+; CHECK-NEXT:    strb w10, [sp, #18]
+; CHECK-NEXT:    strb w9, [sp, #17]
+; CHECK-NEXT:    strb w8, [sp, #16]
 ; CHECK-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEXT:    zip1 z2.h, z5.h, z3.h
-; CHECK-NEXT:    zip1 z4.s, z16.s, z4.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    zip1 z23.b, z24.b, z23.b
-; CHECK-NEXT:    fmov s24, w9
-; CHECK-NEXT:    zip1 z17.h, z22.h, z21.h
-; CHECK-NEXT:    zip1 z24.b, z25.b, z24.b
-; CHECK-NEXT:    fmov s25, w1
-; CHECK-NEXT:    zip1 z7.b, z25.b, z26.b
-; CHECK-NEXT:    zip1 z18.h, z24.h, z23.h
-; CHECK-NEXT:    zip1 z3.h, z7.h, z6.h
-; CHECK-NEXT:    zip1 z5.s, z18.s, z17.s
-; CHECK-NEXT:    zip1 z1.s, z3.s, z2.s
-; CHECK-NEXT:    zip1 z2.d, z5.d, z4.d
-; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    lsl z1.b, z2.b, #7
+; CHECK-NEXT:    ldp q1, q0, [sp]
 ; CHECK-NEXT:    lsl z0.b, z0.b, #7
-; CHECK-NEXT:    asr z1.b, z1.b, #7
+; CHECK-NEXT:    lsl z1.b, z1.b, #7
 ; CHECK-NEXT:    asr z0.b, z0.b, #7
-; CHECK-NEXT:    cmpne p1.b, p0/z, z1.b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    asr z1.b, z1.b, #7
+; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p1, [x0, x8]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v32i8:

>From 6efd8c72f3deba8eb4a9f7d71063670fa63da208 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 10 Oct 2024 13:47:38 +0000
Subject: [PATCH 4/4] Remove llvm::

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bb2a7587849c59..759fb352823f83 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14416,7 +14416,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
 
   // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
   SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
-  SmallVector<SDValue, 16> Intermediates = llvm::map_to_vector<16>(
+  SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
       Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
         return Op.isUndef() ? Undef
                             : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
