[llvm] [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (PR #111698)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 10 06:03:26 PDT 2024
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/111698
>From b38040c060e1346c1c7228c0b796e20d9ca3b2d9 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 9 Oct 2024 13:56:49 +0000
Subject: [PATCH 1/3] [AArch64][SVE] Support lowering fixed-length
BUILD_VECTORS to ZIPs
This allows lowering fixed-length (non-constant) BUILD_VECTORS
(<= 128-bit) to a chain of ZIP1 instructions when Neon is not available,
rather than using the default lowering, which is to spill to the stack
and reload.
For example,
```
t5: v4f32 = BUILD_VECTOR(t0, t1, t2, t3)
```
Becomes:
```
zip1 z0.s, z0.s, z1.s // z0 = t0,t1,...
zip1 z2.s, z2.s, z3.s // z2 = t2,t3,...
zip1 z0.d, z0.d, z2.d // z0 = t0,t1,t2,t3,...
```
When values are already in FPRs, this generally seems to lead to a more
compact output with less movement to/from the stack.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 65 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 +
.../sve-fixed-length-vector-shuffle-tbl.ll | 265 ++--
...streaming-mode-fixed-length-and-combine.ll | 8 +-
...sve-streaming-mode-fixed-length-bitcast.ll | 11 +-
...treaming-mode-fixed-length-build-vector.ll | 255 ++++
.../sve-streaming-mode-fixed-length-concat.ll | 70 +-
...e-streaming-mode-fixed-length-ext-loads.ll | 44 +-
...ing-mode-fixed-length-extract-subvector.ll | 32 +-
...e-streaming-mode-fixed-length-fcopysign.ll | 52 +-
...e-streaming-mode-fixed-length-fp-to-int.ll | 692 +++++-----
...-streaming-mode-fixed-length-fp-vselect.ll | 13 +-
...ing-mode-fixed-length-insert-vector-elt.ll | 10 +-
...e-streaming-mode-fixed-length-int-to-fp.ll | 32 +-
...-streaming-mode-fixed-length-ld2-alloca.ll | 54 +-
...streaming-mode-fixed-length-masked-load.ll | 218 ++--
...treaming-mode-fixed-length-masked-store.ll | 198 +--
...g-mode-fixed-length-permute-zip-uzp-trn.ll | 1140 +++++++----------
...e-streaming-mode-fixed-length-reshuffle.ll | 16 +-
19 files changed, 1518 insertions(+), 1658 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 288fd3639e5eb7..6c1c33da4be996 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -2102,7 +2103,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
setOperationAction(ISD::BITREVERSE, VT, Default);
setOperationAction(ISD::BSWAP, VT, Default);
- setOperationAction(ISD::BUILD_VECTOR, VT, Default);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
setOperationAction(ISD::CTLZ, VT, Default);
setOperationAction(ISD::CTPOP, VT, Default);
@@ -14384,24 +14385,62 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ auto *BVN = cast<BuildVectorSDNode>(Op);
- if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
- if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
- SDLoc DL(Op);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
- SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
- SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
- return convertFromScalableVector(DAG, Op.getValueType(), Seq);
- }
+ if (auto SeqInfo = BVN->isConstantSequence()) {
+ SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
+ SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
+ SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
+ return convertFromScalableVector(DAG, VT, Seq);
+ }
- // Revert to common legalisation for all other variants.
+ if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
+ VT.getVectorNumElements() <= 1 || BVN->isConstant())
return SDValue();
+
+ // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
+ EVT ZipVT = ContainerVT;
+ SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
+ SmallVector<SDValue, 16> Intermediates =
+ llvm::map_to_vector<16>(Op->op_values(), [&](SDValue Op) {
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
+ DAG.getUNDEF(ZipVT), Op, ZeroI64);
+ });
+
+ while (Intermediates.size() > 1) {
+ auto ToZipVT = [&](SDValue Op) { return DAG.getBitcast(ZipVT, Op); };
+ for (unsigned I = 0; I < Intermediates.size(); I += 2) {
+ SDValue Op0 = ToZipVT(Intermediates[I + 0]);
+ SDValue Op1 = ToZipVT(Intermediates[I + 1]);
+ Intermediates[I / 2] = DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+ }
+
+ Intermediates.resize(Intermediates.size() / 2);
+ if (Intermediates.size() > 1) {
+ // Prefer FP values to keep elements within vector registers (and also as
+ // f16 is conveniently a legal type).
+ ZipVT = getPackedSVEVectorVT(EVT::getFloatingPointVT(
+ ZipVT.getVectorElementType().getSizeInBits() * 2));
+ }
}
+ assert(Intermediates.size() == 1);
+ SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
+ return convertFromScalableVector(DAG, VT, Vec);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+ return LowerFixedLengthBuildVectorToSVE(Op, DAG);
+
// Try to build a simple constant vector.
Op = NormalizeBuildVector(Op, DAG);
// Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1bae7562f459a5..95489f85631801 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1256,6 +1256,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 276f23703df3df..a22c00c1ebce14 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,98 +140,65 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_128_NOMAX-NEXT: fmov w8, s3
; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_128_NOMAX-NEXT: fmov w8, s3
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_128_NOMAX-NEXT: fmov w9, s0
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3
; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s0
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_NOMIN_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3
; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s0
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_MIN_256_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
@@ -263,89 +230,62 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_128_NOMAX-NEXT: fmov w8, s0
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_128_NOMAX-NEXT: fmov w9, s1
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z0.b
+; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s0
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s1
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s0
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s1
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
@@ -401,34 +341,23 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: mov z1.b, z0.b[7]
-; CHECK-NEXT: mov z2.b, z0.b[6]
-; CHECK-NEXT: mov z3.b, z0.b[4]
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: fmov w9, s2
; CHECK-NEXT: mov z2.b, z0.b[3]
-; CHECK-NEXT: mov z1.b, z1.b[1]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: mov z3.b, z0.b[2]
-; CHECK-NEXT: strb w9, [sp, #14]
-; CHECK-NEXT: mov z0.b, z0.b[1]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strb w9, [sp, #11]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w9, [sp, #9]
-; CHECK-NEXT: strb w8, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z4.b, z0.b[1]
+; CHECK-NEXT: mov z1.b, z1.b[1]
+; CHECK-NEXT: mov z5.b, z0.b[7]
+; CHECK-NEXT: mov z6.b, z0.b[6]
+; CHECK-NEXT: mov z0.b, z0.b[4]
+; CHECK-NEXT: zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT: zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z0.b, z0.b, z0.b
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index 617b560713c3ab..478072d33d8c9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -184,13 +184,11 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
; CHECK-LABEL: vls_sve_and_2xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fmov s1, wzr
; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: stp wzr, w8, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index b9264ad5f77c37..172e2454d70283 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -91,19 +91,14 @@ define void @bitcast_v32i8(ptr %a, ptr %b) {
define void @bitcast_v2i16(ptr %a, ptr %b) {
; CHECK-LABEL: bitcast_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: str w8, [x1]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: bitcast_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index b8a2e0e0f4bd4c..9166dcbf62c4ef 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -222,3 +222,258 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
ret void
}
+
+define void @build_vector_non_const_v4i1(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr w8, w0, w1, lsl #1
+; CHECK-NEXT: orr w8, w8, w2, lsl #2
+; CHECK-NEXT: orr w8, w8, w3, lsl #3
+; CHECK-NEXT: strb w8, [x4]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4i1:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: orr w8, w0, w1, lsl #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w2, lsl #2
+; NONEON-NOSVE-NEXT: orr w8, w8, w3, lsl #3
+; NONEON-NOSVE-NEXT: strb w8, [x4]
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+ %2 = insertelement <4 x i1> %1, i1 %b, i64 1
+ %3 = insertelement <4 x i1> %2, i1 %c, i64 2
+ %4 = insertelement <4 x i1> %3, i1 %d, i64 3
+ store <4 x i1> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2f64(double %a, double %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x double> undef, double %a, i64 0
+ %2 = insertelement <2 x double> %1, double %b, i64 1
+ store <2 x double> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2f32(float %a, float %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x float> undef, float %a, i64 0
+ %2 = insertelement <2 x float> %1, float %b, i64 1
+ store <2 x float> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v4f32(float %a, float %b, float %c, float %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: // kill: def $s3 killed $s3 def $z3
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z2.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp s2, s3, [sp, #8]
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp]
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x float> undef, float %a, i64 0
+ %2 = insertelement <4 x float> %1, float %b, i64 1
+ %3 = insertelement <4 x float> %2, float %c, i64 2
+ %4 = insertelement <4 x float> %3, float %d, i64 3
+ store <4 x float> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v4f64(double %a, double %b, double %c, double %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $z3
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: stp d2, d3, [sp, #16]
+; NONEON-NOSVE-NEXT: ldp q1, q0, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x double> undef, double %a, i64 0
+ %2 = insertelement <4 x double> %1, double %b, i64 1
+ %3 = insertelement <4 x double> %2, double %c, i64 2
+ %4 = insertelement <4 x double> %3, double %d, i64 3
+ store <4 x double> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v8f16(half %a, half %b, half %c, half %d, half %e, half %f, half %g, half %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h6 killed $h6 def $z6
+; CHECK-NEXT: // kill: def $h4 killed $h4 def $z4
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: // kill: def $h7 killed $h7 def $z7
+; CHECK-NEXT: // kill: def $h5 killed $h5 def $z5
+; CHECK-NEXT: // kill: def $h3 killed $h3 def $z3
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $z1
+; CHECK-NEXT: zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z4.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: zip1 z1.s, z4.s, z6.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: str h7, [sp, #14]
+; NONEON-NOSVE-NEXT: str h6, [sp, #12]
+; NONEON-NOSVE-NEXT: str h5, [sp, #10]
+; NONEON-NOSVE-NEXT: str h4, [sp, #8]
+; NONEON-NOSVE-NEXT: str h3, [sp, #6]
+; NONEON-NOSVE-NEXT: str h2, [sp, #4]
+; NONEON-NOSVE-NEXT: str h1, [sp, #2]
+; NONEON-NOSVE-NEXT: str h0, [sp]
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <8 x half> undef, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %b, i64 1
+ %3 = insertelement <8 x half> %2, half %c, i64 2
+ %4 = insertelement <8 x half> %3, half %d, i64 3
+ %5 = insertelement <8 x half> %4, half %e, i64 4
+ %6 = insertelement <8 x half> %5, half %f, i64 5
+ %7 = insertelement <8 x half> %6, half %g, i64 6
+ %8 = insertelement <8 x half> %7, half %h, i64 7
+ store <8 x half> %8, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: str d0, [x2]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2i32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp w0, w1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x2]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x i32> undef, i32 %a, i64 0
+ %2 = insertelement <2 x i32> %1, i32 %b, i64 1
+ store <2 x i32> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w7
+; CHECK-NEXT: fmov s1, w6
+; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: fmov s2, w4
+; CHECK-NEXT: fmov s3, w3
+; CHECK-NEXT: fmov s4, w2
+; CHECK-NEXT: fmov s5, w1
+; CHECK-NEXT: fmov s6, w0
+; CHECK-NEXT: zip1 z0.b, z1.b, z0.b
+; CHECK-NEXT: fmov s1, w5
+; CHECK-NEXT: zip1 z1.b, z2.b, z1.b
+; CHECK-NEXT: zip1 z2.b, z4.b, z3.b
+; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT: zip1 z1.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: strb w7, [sp, #15]
+; NONEON-NOSVE-NEXT: ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT: strb w6, [sp, #14]
+; NONEON-NOSVE-NEXT: strb w5, [sp, #13]
+; NONEON-NOSVE-NEXT: strb w4, [sp, #12]
+; NONEON-NOSVE-NEXT: strb w3, [sp, #11]
+; NONEON-NOSVE-NEXT: strb w2, [sp, #10]
+; NONEON-NOSVE-NEXT: strb w1, [sp, #9]
+; NONEON-NOSVE-NEXT: strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x8]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <8 x i8> undef, i8 %a, i64 0
+ %2 = insertelement <8 x i8> %1, i8 %b, i64 1
+ %3 = insertelement <8 x i8> %2, i8 %c, i64 2
+ %4 = insertelement <8 x i8> %3, i8 %d, i64 3
+ %5 = insertelement <8 x i8> %4, i8 %e, i64 4
+ %6 = insertelement <8 x i8> %5, i8 %f, i64 5
+ %7 = insertelement <8 x i8> %6, i8 %g, i64 6
+ %8 = insertelement <8 x i8> %7, i8 %h, i64 7
+ store <8 x i8> %8, ptr %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 4b6285b2732fe5..c1810c678ea522 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -12,34 +12,22 @@ target triple = "aarch64-unknown-linux-gnu"
define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-LABEL: concat_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: mov z2.h, z1.h[3]
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov z2.h, z1.h[3]
; CHECK-NEXT: mov z3.h, z1.h[2]
-; CHECK-NEXT: mov z1.h, z1.h[1]
-; CHECK-NEXT: mov z4.h, z0.h[3]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[2]
-; CHECK-NEXT: mov z0.h, z0.h[1]
-; CHECK-NEXT: strb w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w9, [sp, #14]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z4.h, z1.h[1]
+; CHECK-NEXT: mov z5.h, z0.h[3]
+; CHECK-NEXT: mov z6.h, z0.h[2]
+; CHECK-NEXT: mov z7.h, z0.h[1]
+; CHECK-NEXT: zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT: zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z0.b, z0.b, z7.b
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v8i8:
@@ -152,22 +140,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) {
; CHECK-LABEL: concat_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z2.s, z1.s[1]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: strh w9, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z3.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v4i16:
@@ -428,18 +408,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
; CHECK-LABEL: concat_v4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: str h1, [sp, #12]
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h2, [sp, #14]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z3.h, z0.h[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 50a05cb4b1e277..7d6336a43a4fd1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -326,29 +326,29 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
; CHECK-LABEL: load_sext_v2i64i256:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: mov z1.d, z0.d[1]
-; CHECK-NEXT: asr x9, x8, #63
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: stp x8, x9, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: asr x8, x10, #63
-; CHECK-NEXT: mov z0.d, x9
-; CHECK-NEXT: stp x10, x8, [sp, #16]
-; CHECK-NEXT: mov z1.d, x8
-; CHECK-NEXT: ldp q2, q4, [sp], #32
-; CHECK-NEXT: mov z3.d, z0.d[1]
-; CHECK-NEXT: mov z5.d, z1.d[1]
-; CHECK-NEXT: mov z6.d, z2.d[1]
-; CHECK-NEXT: fmov x2, d0
-; CHECK-NEXT: mov z0.d, z4.d[1]
-; CHECK-NEXT: fmov x6, d1
-; CHECK-NEXT: fmov x0, d2
-; CHECK-NEXT: fmov x4, d4
-; CHECK-NEXT: fmov x3, d3
-; CHECK-NEXT: fmov x7, d5
-; CHECK-NEXT: fmov x1, d6
-; CHECK-NEXT: fmov x5, d0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: asr x8, x8, #63
+; CHECK-NEXT: fmov d3, x8
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: asr x9, x9, #63
+; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT: mov z3.d, x9
+; CHECK-NEXT: fmov x2, d2
+; CHECK-NEXT: zip1 z1.d, z1.d, z4.d
+; CHECK-NEXT: mov z4.d, z2.d[1]
+; CHECK-NEXT: mov z5.d, z0.d[1]
+; CHECK-NEXT: mov z6.d, z3.d[1]
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: fmov x6, d3
+; CHECK-NEXT: mov z2.d, z1.d[1]
+; CHECK-NEXT: fmov x3, d4
+; CHECK-NEXT: fmov x1, d5
+; CHECK-NEXT: fmov x4, d1
+; CHECK-NEXT: fmov x7, d6
+; CHECK-NEXT: fmov x5, d2
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 2665696308463f..a728cbe97056db 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -10,23 +10,15 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
; CHECK-LABEL: extract_subvector_v8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z1.b, z0.b[7]
; CHECK-NEXT: mov z2.b, z0.b[6]
; CHECK-NEXT: mov z3.b, z0.b[5]
; CHECK-NEXT: mov z0.b, z0.b[4]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
@@ -53,23 +45,15 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
; CHECK-LABEL: extract_subvector_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z1.b, z0.b[7]
; CHECK-NEXT: mov z2.b, z0.b[6]
; CHECK-NEXT: mov z3.b, z0.b[5]
; CHECK-NEXT: mov z0.b, z0.b[4]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index dad53b31db0b0f..f1771a753826cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -1126,49 +1126,39 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
; SVE-LABEL: test_copysign_v4f16_v4f64:
; SVE: // %bb.0:
-; SVE-NEXT: sub sp, sp, #16
-; SVE-NEXT: .cfi_def_cfa_offset 16
-; SVE-NEXT: ldp q1, q0, [x1]
-; SVE-NEXT: ldr d4, [x0]
-; SVE-NEXT: and z4.h, z4.h, #0x7fff
-; SVE-NEXT: mov z2.d, z0.d[1]
-; SVE-NEXT: mov z3.d, z1.d[1]
-; SVE-NEXT: fcvt h0, d0
+; SVE-NEXT: ldp q0, q1, [x1]
+; SVE-NEXT: mov z2.d, z1.d[1]
+; SVE-NEXT: mov z3.d, z0.d[1]
; SVE-NEXT: fcvt h1, d1
+; SVE-NEXT: fcvt h0, d0
; SVE-NEXT: fcvt h2, d2
; SVE-NEXT: fcvt h3, d3
-; SVE-NEXT: str h0, [sp, #12]
-; SVE-NEXT: str h1, [sp, #8]
-; SVE-NEXT: str h2, [sp, #14]
-; SVE-NEXT: str h3, [sp, #10]
-; SVE-NEXT: ldr d0, [sp, #8]
+; SVE-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE-NEXT: zip1 z0.s, z0.s, z1.s
+; SVE-NEXT: ldr d1, [x0]
+; SVE-NEXT: and z1.h, z1.h, #0x7fff
; SVE-NEXT: and z0.h, z0.h, #0x8000
-; SVE-NEXT: orr z0.d, z4.d, z0.d
+; SVE-NEXT: orr z0.d, z1.d, z0.d
; SVE-NEXT: str d0, [x0]
-; SVE-NEXT: add sp, sp, #16
; SVE-NEXT: ret
;
; SVE2-LABEL: test_copysign_v4f16_v4f64:
; SVE2: // %bb.0:
-; SVE2-NEXT: sub sp, sp, #16
-; SVE2-NEXT: .cfi_def_cfa_offset 16
-; SVE2-NEXT: ldp q2, q1, [x1]
-; SVE2-NEXT: mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT: ldr d5, [x0]
-; SVE2-NEXT: mov z3.d, z1.d[1]
-; SVE2-NEXT: mov z4.d, z2.d[1]
+; SVE2-NEXT: ldp q0, q1, [x1]
+; SVE2-NEXT: mov z2.d, z1.d[1]
+; SVE2-NEXT: mov z3.d, z0.d[1]
; SVE2-NEXT: fcvt h1, d1
+; SVE2-NEXT: fcvt h0, d0
; SVE2-NEXT: fcvt h2, d2
; SVE2-NEXT: fcvt h3, d3
-; SVE2-NEXT: fcvt h4, d4
-; SVE2-NEXT: str h1, [sp, #12]
-; SVE2-NEXT: str h2, [sp, #8]
-; SVE2-NEXT: str h3, [sp, #14]
-; SVE2-NEXT: str h4, [sp, #10]
-; SVE2-NEXT: ldr d1, [sp, #8]
-; SVE2-NEXT: bsl z5.d, z5.d, z1.d, z0.d
-; SVE2-NEXT: str d5, [x0]
-; SVE2-NEXT: add sp, sp, #16
+; SVE2-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2-NEXT: mov z2.h, #32767 // =0x7fff
+; SVE2-NEXT: zip1 z0.s, z0.s, z1.s
+; SVE2-NEXT: ldr d1, [x0]
+; SVE2-NEXT: bsl z1.d, z1.d, z0.d, z2.d
+; SVE2-NEXT: str d1, [x0]
; SVE2-NEXT: ret
;
; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index a206fbc5102953..11fee267660c03 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -443,9 +443,10 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: fcvtzu x8, h0
; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: stp x8, x9, [sp, #-16]!
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
@@ -471,19 +472,20 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzu_v4f16_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fcvtzu x8, h0
-; CHECK-NEXT: mov z2.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: fcvtzu x10, h2
-; CHECK-NEXT: fcvtzu x11, h0
-; CHECK-NEXT: stp x8, x9, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: ldp q1, q0, [sp]
+; CHECK-NEXT: mov z1.h, z0.h[3]
+; CHECK-NEXT: mov z2.h, z0.h[2]
+; CHECK-NEXT: mov z3.h, z0.h[1]
+; CHECK-NEXT: fcvtzu x10, h0
+; CHECK-NEXT: fcvtzu x8, h1
+; CHECK-NEXT: fcvtzu x9, h2
+; CHECK-NEXT: fcvtzu x11, h3
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x11
+; CHECK-NEXT: zip1 z1.d, z2.d, z1.d
; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
@@ -521,31 +523,35 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: fcvtzu x12, h0
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z3.h, z0.h[2]
+; CHECK-NEXT: mov z4.h, z0.h[1]
+; CHECK-NEXT: fcvtzu x10, h0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: fcvtzu x8, h1
-; CHECK-NEXT: mov z3.h, z1.h[3]
-; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: fcvtzu x9, h2
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: fcvtzu x10, h3
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzu x11, h1
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-64]!
-; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: fcvtzu x8, h2
; CHECK-NEXT: fcvtzu x9, h3
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: fcvtzu x10, h0
-; CHECK-NEXT: ldp q2, q3, [sp]
-; CHECK-NEXT: stp x12, x8, [sp, #32]
-; CHECK-NEXT: stp x10, x9, [sp, #48]
-; CHECK-NEXT: ldp q1, q0, [sp, #32]
-; CHECK-NEXT: stp q2, q3, [x1, #32]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: fcvtzu x11, h4
+; CHECK-NEXT: mov z5.h, z1.h[3]
+; CHECK-NEXT: mov z6.h, z1.h[2]
+; CHECK-NEXT: mov z2.h, z1.h[1]
+; CHECK-NEXT: fcvtzu x14, h1
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: fcvtzu x12, h5
+; CHECK-NEXT: fcvtzu x13, h6
+; CHECK-NEXT: fcvtzu x15, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x12
+; CHECK-NEXT: fmov d4, x13
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: fmov d3, x14
+; CHECK-NEXT: zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT: fmov d4, x15
+; CHECK-NEXT: stp q2, q0, [x1]
+; CHECK-NEXT: zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT: stp q3, q1, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
@@ -598,57 +604,67 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzu_v16f16_v16i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z4.h, z1.h[1]
+; CHECK-NEXT: mov z6.h, z1.h[3]
+; CHECK-NEXT: fcvtzu x9, h1
+; CHECK-NEXT: fcvtzu x8, h0
+; CHECK-NEXT: mov z7.h, z0.h[1]
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT: mov z4.h, z2.h[1]
-; CHECK-NEXT: fcvtzu x8, h2
-; CHECK-NEXT: mov z5.h, z2.h[3]
-; CHECK-NEXT: mov z2.h, z2.h[2]
-; CHECK-NEXT: fcvtzu x12, h3
-; CHECK-NEXT: fcvtzu x9, h4
-; CHECK-NEXT: mov z4.h, z3.h[1]
-; CHECK-NEXT: fcvtzu x10, h5
-; CHECK-NEXT: mov z5.h, z3.h[3]
-; CHECK-NEXT: fcvtzu x11, h2
-; CHECK-NEXT: mov z2.h, z3.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-128]!
-; CHECK-NEXT: .cfi_def_cfa_offset 128
-; CHECK-NEXT: fcvtzu x8, h4
-; CHECK-NEXT: fcvtzu x9, h5
-; CHECK-NEXT: stp x11, x10, [sp, #16]
+; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
; CHECK-NEXT: fcvtzu x10, h2
-; CHECK-NEXT: mov z3.h, z1.h[1]
-; CHECK-NEXT: mov z4.h, z1.h[3]
-; CHECK-NEXT: fcvtzu x11, h1
+; CHECK-NEXT: fcvtzu x11, h4
+; CHECK-NEXT: fcvtzu x12, h6
; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: stp x12, x8, [sp, #64]
-; CHECK-NEXT: fcvtzu x12, h3
-; CHECK-NEXT: fcvtzu x8, h4
-; CHECK-NEXT: stp x10, x9, [sp, #80]
-; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzu x10, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x11, x12, [sp, #32]
-; CHECK-NEXT: fcvtzu x11, h2
-; CHECK-NEXT: fcvtzu x12, h3
-; CHECK-NEXT: stp x9, x8, [sp, #48]
-; CHECK-NEXT: fcvtzu x8, h0
-; CHECK-NEXT: ldp q0, q1, [sp]
-; CHECK-NEXT: ldp q3, q4, [sp, #64]
-; CHECK-NEXT: stp x10, x11, [sp, #96]
-; CHECK-NEXT: ldp q6, q7, [sp, #32]
-; CHECK-NEXT: stp x8, x12, [sp, #112]
-; CHECK-NEXT: ldp q5, q2, [sp, #96]
-; CHECK-NEXT: stp q0, q1, [x1, #32]
-; CHECK-NEXT: stp q6, q7, [x1]
-; CHECK-NEXT: stp q3, q4, [x1, #96]
-; CHECK-NEXT: stp q5, q2, [x1, #64]
-; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: fmov d16, x9
+; CHECK-NEXT: mov z2.h, z3.h[3]
+; CHECK-NEXT: mov z4.h, z5.h[3]
+; CHECK-NEXT: fcvtzu x14, h3
+; CHECK-NEXT: fcvtzu x13, h1
+; CHECK-NEXT: fcvtzu x15, h5
+; CHECK-NEXT: mov z1.h, z3.h[1]
+; CHECK-NEXT: mov z6.h, z5.h[1]
+; CHECK-NEXT: mov z5.h, z5.h[2]
+; CHECK-NEXT: mov z3.h, z3.h[2]
+; CHECK-NEXT: fcvtzu x9, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fcvtzu x10, h4
+; CHECK-NEXT: fmov d4, x11
+; CHECK-NEXT: fcvtzu x11, h7
+; CHECK-NEXT: fmov d7, x12
+; CHECK-NEXT: fcvtzu x12, h0
+; CHECK-NEXT: fmov d0, x13
+; CHECK-NEXT: fcvtzu x13, h1
+; CHECK-NEXT: fmov d1, x14
+; CHECK-NEXT: fcvtzu x14, h6
+; CHECK-NEXT: fmov d6, x15
+; CHECK-NEXT: fcvtzu x15, h5
+; CHECK-NEXT: fmov d5, x9
+; CHECK-NEXT: fcvtzu x9, h3
+; CHECK-NEXT: zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT: fmov d16, x8
+; CHECK-NEXT: zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT: fmov d3, x12
+; CHECK-NEXT: fmov d7, x10
+; CHECK-NEXT: stp q4, q0, [x1, #64]
+; CHECK-NEXT: fmov d0, x14
+; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT: zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT: zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT: fmov d16, x15
+; CHECK-NEXT: stp q3, q2, [x1]
+; CHECK-NEXT: fmov d2, x13
+; CHECK-NEXT: zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT: stp q0, q7, [x1, #96]
+; CHECK-NEXT: stp q1, q4, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
@@ -1216,26 +1232,18 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
; CHECK-LABEL: fcvtzu_v4f64_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: mov z2.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z1.s[1]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: mov z2.s, z1.s[1]
+; CHECK-NEXT: mov z3.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
@@ -1270,40 +1278,29 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
; CHECK-LABEL: fcvtzu_v8f64_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldp q1, q0, [x0, #32]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: ldp q2, q3, [x0]
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.s, z3.s[1]
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT: mov z4.s, z0.s[1]
+; CHECK-NEXT: mov z5.s, z1.s[1]
+; CHECK-NEXT: mov z6.s, z3.s[1]
+; CHECK-NEXT: mov z7.s, z2.s[1]
+; CHECK-NEXT: zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
@@ -1360,73 +1357,50 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzu_v16f64_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: ldp q5, q6, [x0, #96]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x0, #96]
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: ldp q0, q4, [x0, #32]
+; CHECK-NEXT: ldp q2, q7, [x0, #64]
+; CHECK-NEXT: ldp q1, q3, [x0]
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d
; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT: ldp q6, q7, [x0, #64]
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s
; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z16.s, z1.s[1]
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z0.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z3.s[1]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: movprfx z3, z7
-; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT: mov z3.s, z5.s[1]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: ldp q1, q0, [sp]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: mov z17.s, z6.s[1]
+; CHECK-NEXT: mov z16.s, z4.s[1]
+; CHECK-NEXT: mov z18.s, z5.s[1]
+; CHECK-NEXT: mov z21.s, z0.s[1]
+; CHECK-NEXT: mov z19.s, z7.s[1]
+; CHECK-NEXT: mov z20.s, z2.s[1]
+; CHECK-NEXT: mov z22.s, z3.s[1]
+; CHECK-NEXT: mov z23.s, z1.s[1]
+; CHECK-NEXT: zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT: zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT: zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT: zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT: zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT: zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: stp q0, q2, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
@@ -2187,9 +2161,10 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: fcvtzs x8, h0
; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: stp x8, x9, [sp, #-16]!
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
@@ -2215,19 +2190,20 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzs_v4f16_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fcvtzs x8, h0
-; CHECK-NEXT: mov z2.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: fcvtzs x10, h2
-; CHECK-NEXT: fcvtzs x11, h0
-; CHECK-NEXT: stp x8, x9, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: ldp q1, q0, [sp]
+; CHECK-NEXT: mov z1.h, z0.h[3]
+; CHECK-NEXT: mov z2.h, z0.h[2]
+; CHECK-NEXT: mov z3.h, z0.h[1]
+; CHECK-NEXT: fcvtzs x10, h0
+; CHECK-NEXT: fcvtzs x8, h1
+; CHECK-NEXT: fcvtzs x9, h2
+; CHECK-NEXT: fcvtzs x11, h3
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x11
+; CHECK-NEXT: zip1 z1.d, z2.d, z1.d
; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
@@ -2265,31 +2241,35 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: fcvtzs x12, h0
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z3.h, z0.h[2]
+; CHECK-NEXT: mov z4.h, z0.h[1]
+; CHECK-NEXT: fcvtzs x10, h0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: fcvtzs x8, h1
-; CHECK-NEXT: mov z3.h, z1.h[3]
-; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: fcvtzs x9, h2
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: fcvtzs x10, h3
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzs x11, h1
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-64]!
-; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: fcvtzs x8, h2
; CHECK-NEXT: fcvtzs x9, h3
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: fcvtzs x10, h0
-; CHECK-NEXT: ldp q2, q3, [sp]
-; CHECK-NEXT: stp x12, x8, [sp, #32]
-; CHECK-NEXT: stp x10, x9, [sp, #48]
-; CHECK-NEXT: ldp q1, q0, [sp, #32]
-; CHECK-NEXT: stp q2, q3, [x1, #32]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: fcvtzs x11, h4
+; CHECK-NEXT: mov z5.h, z1.h[3]
+; CHECK-NEXT: mov z6.h, z1.h[2]
+; CHECK-NEXT: mov z2.h, z1.h[1]
+; CHECK-NEXT: fcvtzs x14, h1
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: fcvtzs x12, h5
+; CHECK-NEXT: fcvtzs x13, h6
+; CHECK-NEXT: fcvtzs x15, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x12
+; CHECK-NEXT: fmov d4, x13
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: fmov d3, x14
+; CHECK-NEXT: zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT: fmov d4, x15
+; CHECK-NEXT: stp q2, q0, [x1]
+; CHECK-NEXT: zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT: stp q3, q1, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
@@ -2342,57 +2322,67 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzs_v16f16_v16i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z4.h, z1.h[1]
+; CHECK-NEXT: mov z6.h, z1.h[3]
+; CHECK-NEXT: fcvtzs x9, h1
+; CHECK-NEXT: fcvtzs x8, h0
+; CHECK-NEXT: mov z7.h, z0.h[1]
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT: mov z4.h, z2.h[1]
-; CHECK-NEXT: fcvtzs x8, h2
-; CHECK-NEXT: mov z5.h, z2.h[3]
-; CHECK-NEXT: mov z2.h, z2.h[2]
-; CHECK-NEXT: fcvtzs x12, h3
-; CHECK-NEXT: fcvtzs x9, h4
-; CHECK-NEXT: mov z4.h, z3.h[1]
-; CHECK-NEXT: fcvtzs x10, h5
-; CHECK-NEXT: mov z5.h, z3.h[3]
-; CHECK-NEXT: fcvtzs x11, h2
-; CHECK-NEXT: mov z2.h, z3.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-128]!
-; CHECK-NEXT: .cfi_def_cfa_offset 128
-; CHECK-NEXT: fcvtzs x8, h4
-; CHECK-NEXT: fcvtzs x9, h5
-; CHECK-NEXT: stp x11, x10, [sp, #16]
+; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
; CHECK-NEXT: fcvtzs x10, h2
-; CHECK-NEXT: mov z3.h, z1.h[1]
-; CHECK-NEXT: mov z4.h, z1.h[3]
-; CHECK-NEXT: fcvtzs x11, h1
+; CHECK-NEXT: fcvtzs x11, h4
+; CHECK-NEXT: fcvtzs x12, h6
; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: stp x12, x8, [sp, #64]
-; CHECK-NEXT: fcvtzs x12, h3
-; CHECK-NEXT: fcvtzs x8, h4
-; CHECK-NEXT: stp x10, x9, [sp, #80]
-; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzs x10, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x11, x12, [sp, #32]
-; CHECK-NEXT: fcvtzs x11, h2
-; CHECK-NEXT: fcvtzs x12, h3
-; CHECK-NEXT: stp x9, x8, [sp, #48]
-; CHECK-NEXT: fcvtzs x8, h0
-; CHECK-NEXT: ldp q0, q1, [sp]
-; CHECK-NEXT: ldp q3, q4, [sp, #64]
-; CHECK-NEXT: stp x10, x11, [sp, #96]
-; CHECK-NEXT: ldp q6, q7, [sp, #32]
-; CHECK-NEXT: stp x8, x12, [sp, #112]
-; CHECK-NEXT: ldp q5, q2, [sp, #96]
-; CHECK-NEXT: stp q0, q1, [x1, #32]
-; CHECK-NEXT: stp q6, q7, [x1]
-; CHECK-NEXT: stp q3, q4, [x1, #96]
-; CHECK-NEXT: stp q5, q2, [x1, #64]
-; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: fmov d16, x9
+; CHECK-NEXT: mov z2.h, z3.h[3]
+; CHECK-NEXT: mov z4.h, z5.h[3]
+; CHECK-NEXT: fcvtzs x14, h3
+; CHECK-NEXT: fcvtzs x13, h1
+; CHECK-NEXT: fcvtzs x15, h5
+; CHECK-NEXT: mov z1.h, z3.h[1]
+; CHECK-NEXT: mov z6.h, z5.h[1]
+; CHECK-NEXT: mov z5.h, z5.h[2]
+; CHECK-NEXT: mov z3.h, z3.h[2]
+; CHECK-NEXT: fcvtzs x9, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fcvtzs x10, h4
+; CHECK-NEXT: fmov d4, x11
+; CHECK-NEXT: fcvtzs x11, h7
+; CHECK-NEXT: fmov d7, x12
+; CHECK-NEXT: fcvtzs x12, h0
+; CHECK-NEXT: fmov d0, x13
+; CHECK-NEXT: fcvtzs x13, h1
+; CHECK-NEXT: fmov d1, x14
+; CHECK-NEXT: fcvtzs x14, h6
+; CHECK-NEXT: fmov d6, x15
+; CHECK-NEXT: fcvtzs x15, h5
+; CHECK-NEXT: fmov d5, x9
+; CHECK-NEXT: fcvtzs x9, h3
+; CHECK-NEXT: zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT: fmov d16, x8
+; CHECK-NEXT: zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT: fmov d3, x12
+; CHECK-NEXT: fmov d7, x10
+; CHECK-NEXT: stp q4, q0, [x1, #64]
+; CHECK-NEXT: fmov d0, x14
+; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT: zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT: zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT: fmov d16, x15
+; CHECK-NEXT: stp q3, q2, [x1]
+; CHECK-NEXT: fmov d2, x13
+; CHECK-NEXT: zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT: stp q0, q7, [x1, #96]
+; CHECK-NEXT: stp q1, q4, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
@@ -2962,26 +2952,18 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
; CHECK-LABEL: fcvtzs_v4f64_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: mov z2.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z1.s[1]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: mov z2.s, z1.s[1]
+; CHECK-NEXT: mov z3.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
@@ -3016,40 +2998,29 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
; CHECK-LABEL: fcvtzs_v8f64_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldp q1, q0, [x0, #32]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: ldp q2, q3, [x0]
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.s, z3.s[1]
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT: mov z4.s, z0.s[1]
+; CHECK-NEXT: mov z5.s, z1.s[1]
+; CHECK-NEXT: mov z6.s, z3.s[1]
+; CHECK-NEXT: mov z7.s, z2.s[1]
+; CHECK-NEXT: zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
@@ -3106,73 +3077,50 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzs_v16f64_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: ldp q5, q6, [x0, #96]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x0, #96]
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: ldp q0, q4, [x0, #32]
+; CHECK-NEXT: ldp q2, q7, [x0, #64]
+; CHECK-NEXT: ldp q1, q3, [x0]
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d
; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT: ldp q6, q7, [x0, #64]
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s
; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z16.s, z1.s[1]
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z0.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z3.s[1]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: movprfx z3, z7
-; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT: mov z3.s, z5.s[1]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: ldp q1, q0, [sp]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: mov z17.s, z6.s[1]
+; CHECK-NEXT: mov z16.s, z4.s[1]
+; CHECK-NEXT: mov z18.s, z5.s[1]
+; CHECK-NEXT: mov z21.s, z0.s[1]
+; CHECK-NEXT: mov z19.s, z7.s[1]
+; CHECK-NEXT: mov z20.s, z2.s[1]
+; CHECK-NEXT: mov z22.s, z3.s[1]
+; CHECK-NEXT: mov z23.s, z1.s[1]
+; CHECK-NEXT: zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT: zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT: zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT: zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT: zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT: zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: stp q0, q2, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 035c76b569298a..e3c89981cb27af 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -8,25 +8,20 @@ target triple = "aarch64-unknown-linux-gnu"
define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask) {
; CHECK-LABEL: select_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
-; CHECK-NEXT: mov z3.s, z2.s[1]
-; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: mov z4.s, z2.s[1]
+; CHECK-NEXT: zip1 z3.h, z0.h, z0.h
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d2, [sp, #8]
+; CHECK-NEXT: zip1 z2.h, z2.h, z4.h
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
; CHECK-NEXT: lsl z2.h, z2.h, #15
; CHECK-NEXT: asr z2.h, z2.h, #15
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: select_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index d77473ed8f08e5..87e3d0d09817ba 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -506,14 +506,12 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
; CHECK-LABEL: insertelement_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: fmov h1, #5.00000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: insertelement_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index afd3bb7161c155..f71bfb770b15f4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1160,18 +1160,16 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
; CHECK-LABEL: ucvtf_v2i64_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: mov z1.d, z0.d[1]
+; CHECK-NEXT: mov z2.d, z0.d[1]
; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT: fmov x9, d2
; CHECK-NEXT: ucvtf h0, x8
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: ucvtf h1, x8
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ucvtf h2, x9
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
@@ -2618,18 +2616,16 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
; CHECK-LABEL: scvtf_v2i64_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: mov z1.d, z0.d[1]
+; CHECK-NEXT: mov z2.d, z0.d[1]
; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
+; CHECK-NEXT: fmov x9, d2
; CHECK-NEXT: scvtf h0, x8
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: scvtf h1, x8
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: scvtf h2, x9
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 270f05a806b82d..ef6b1c9acbf105 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -10,25 +10,20 @@ declare void @def(ptr)
define void @alloc_v4i8(ptr %st_ptr) nounwind {
; CHECK-LABEL: alloc_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #28
-; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT: add x20, sp, #28
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x20, sp, #12
; CHECK-NEXT: bl def
; CHECK-NEXT: ptrue p0.b, vl2
; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x20]
; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: mov z2.b, z0.b[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
; CHECK-NEXT: st1b { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: alloc_v4i8:
@@ -62,32 +57,29 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-LABEL: alloc_v6i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: add x0, sp, #8
; CHECK-NEXT: bl def
-; CHECK-NEXT: ldr d0, [sp, #24]
+; CHECK-NEXT: ldr d0, [sp, #8]
; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: add x8, sp, #4
; CHECK-NEXT: ptrue p1.s, vl2
; CHECK-NEXT: mov z1.b, z0.b[3]
-; CHECK-NEXT: mov z2.b, z0.b[5]
-; CHECK-NEXT: mov z0.b, z0.b[1]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: add x8, sp, #20
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: st1b { z0.h }, p0, [x8]
-; CHECK-NEXT: ld1h { z0.s }, p1/z, [x8]
-; CHECK-NEXT: strb w9, [x19, #2]
+; CHECK-NEXT: mov z2.b, z0.b[1]
+; CHECK-NEXT: mov z0.b, z0.b[5]
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z2.h, z0.h, z0.h
+; CHECK-NEXT: zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT: st1b { z1.h }, p0, [x8]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x19, #2]
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strh w8, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: alloc_v6i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 5f4b9dd1592cf2..1b90aed22f9d8d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -676,79 +676,105 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
; CHECK-LABEL: masked_load_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldr w8, [sp, #224]
-; CHECK-NEXT: ldr w9, [sp, #216]
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: strb w7, [sp, #6]
-; CHECK-NEXT: strb w8, [sp, #31]
-; CHECK-NEXT: ldr w8, [sp, #208]
-; CHECK-NEXT: strb w9, [sp, #30]
-; CHECK-NEXT: ldr w9, [sp, #200]
-; CHECK-NEXT: strb w8, [sp, #29]
; CHECK-NEXT: ldr w8, [sp, #192]
-; CHECK-NEXT: strb w9, [sp, #28]
; CHECK-NEXT: ldr w9, [sp, #184]
-; CHECK-NEXT: strb w8, [sp, #27]
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ldr w10, [sp, #160]
+; CHECK-NEXT: ldr w11, [sp, #144]
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ldr w8, [sp, #176]
-; CHECK-NEXT: strb w9, [sp, #26]
+; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: ldr w9, [sp, #168]
-; CHECK-NEXT: strb w8, [sp, #25]
-; CHECK-NEXT: ldr w8, [sp, #160]
-; CHECK-NEXT: strb w9, [sp, #24]
-; CHECK-NEXT: ldr w9, [sp, #152]
-; CHECK-NEXT: strb w8, [sp, #23]
-; CHECK-NEXT: ldr w8, [sp, #144]
-; CHECK-NEXT: strb w9, [sp, #22]
+; CHECK-NEXT: fmov s3, w10
+; CHECK-NEXT: fmov s4, w11
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: ldr w8, [sp, #152]
+; CHECK-NEXT: ldr w10, [sp, #112]
+; CHECK-NEXT: fmov s6, w9
; CHECK-NEXT: ldr w9, [sp, #136]
-; CHECK-NEXT: strb w8, [sp, #21]
+; CHECK-NEXT: ldr w11, [sp, #96]
+; CHECK-NEXT: fmov s5, w8
; CHECK-NEXT: ldr w8, [sp, #128]
-; CHECK-NEXT: strb w9, [sp, #20]
+; CHECK-NEXT: zip1 z0.b, z1.b, z0.b
+; CHECK-NEXT: fmov s7, w9
; CHECK-NEXT: ldr w9, [sp, #120]
-; CHECK-NEXT: strb w8, [sp, #19]
-; CHECK-NEXT: ldr w8, [sp, #112]
-; CHECK-NEXT: strb w9, [sp, #18]
-; CHECK-NEXT: ldr w9, [sp, #104]
-; CHECK-NEXT: strb w8, [sp, #17]
-; CHECK-NEXT: ldr w8, [sp, #96]
-; CHECK-NEXT: strb w9, [sp, #16]
+; CHECK-NEXT: fmov s18, w10
+; CHECK-NEXT: fmov s16, w8
+; CHECK-NEXT: ldr w8, [sp, #104]
+; CHECK-NEXT: zip1 z2.b, z6.b, z2.b
+; CHECK-NEXT: fmov s17, w9
; CHECK-NEXT: ldr w9, [sp, #88]
-; CHECK-NEXT: strb w8, [sp, #15]
+; CHECK-NEXT: fmov s20, w11
+; CHECK-NEXT: fmov s19, w8
; CHECK-NEXT: ldr w8, [sp, #80]
-; CHECK-NEXT: strb w9, [sp, #14]
+; CHECK-NEXT: ldr w10, [sp, #64]
+; CHECK-NEXT: fmov s21, w9
; CHECK-NEXT: ldr w9, [sp, #72]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: ldr w8, [sp, #64]
-; CHECK-NEXT: strb w9, [sp, #12]
-; CHECK-NEXT: ldr w9, [sp, #56]
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: ldr w8, [sp, #48]
-; CHECK-NEXT: strb w9, [sp, #10]
+; CHECK-NEXT: ldr w11, [sp, #48]
+; CHECK-NEXT: fmov s22, w8
+; CHECK-NEXT: ldr w8, [sp, #56]
+; CHECK-NEXT: zip1 z3.b, z5.b, z3.b
+; CHECK-NEXT: fmov s23, w9
; CHECK-NEXT: ldr w9, [sp, #40]
-; CHECK-NEXT: strb w8, [sp, #9]
+; CHECK-NEXT: zip1 z4.b, z7.b, z4.b
+; CHECK-NEXT: fmov s25, w8
; CHECK-NEXT: ldr w8, [sp, #32]
-; CHECK-NEXT: strb w9, [sp, #8]
-; CHECK-NEXT: strb w8, [sp, #7]
+; CHECK-NEXT: fmov s24, w10
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: ldr w9, [sp, #24]
+; CHECK-NEXT: fmov s26, w11
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: ldr w8, [sp, #16]
+; CHECK-NEXT: zip1 z16.b, z17.b, z16.b
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: ldr w9, [sp, #8]
+; CHECK-NEXT: zip1 z17.b, z19.b, z18.b
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: ldr w8, [sp]
+; CHECK-NEXT: zip1 z19.b, z21.b, z20.b
+; CHECK-NEXT: fmov s18, w9
+; CHECK-NEXT: zip1 z20.b, z23.b, z22.b
+; CHECK-NEXT: fmov s23, w7
+; CHECK-NEXT: fmov s22, w8
+; CHECK-NEXT: zip1 z21.b, z25.b, z24.b
+; CHECK-NEXT: zip1 z1.b, z1.b, z26.b
+; CHECK-NEXT: zip1 z5.b, z5.b, z6.b
+; CHECK-NEXT: fmov s24, w3
+; CHECK-NEXT: fmov s25, w2
+; CHECK-NEXT: zip1 z6.b, z18.b, z7.b
+; CHECK-NEXT: fmov s18, w6
+; CHECK-NEXT: fmov s26, w1
+; CHECK-NEXT: zip1 z7.b, z23.b, z22.b
+; CHECK-NEXT: fmov s22, w5
+; CHECK-NEXT: fmov s23, w4
+; CHECK-NEXT: zip1 z0.h, z2.h, z0.h
+; CHECK-NEXT: zip1 z2.h, z4.h, z3.h
+; CHECK-NEXT: zip1 z3.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z4.h, z20.h, z19.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z21.h
+; CHECK-NEXT: zip1 z5.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z18.b, z22.b, z18.b
+; CHECK-NEXT: zip1 z22.b, z24.b, z23.b
; CHECK-NEXT: mov w8, #16 // =0x10
-; CHECK-NEXT: strb w6, [sp, #5]
-; CHECK-NEXT: strb w5, [sp, #4]
-; CHECK-NEXT: strb w4, [sp, #3]
-; CHECK-NEXT: strb w3, [sp, #2]
-; CHECK-NEXT: strb w2, [sp, #1]
-; CHECK-NEXT: strb w1, [sp]
-; CHECK-NEXT: ldp q1, q0, [sp]
+; CHECK-NEXT: zip1 z23.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z0.s, z2.s, z0.s
+; CHECK-NEXT: zip1 z2.s, z4.s, z3.s
+; CHECK-NEXT: zip1 z1.s, z5.s, z1.s
+; CHECK-NEXT: zip1 z6.h, z18.h, z7.h
+; CHECK-NEXT: zip1 z7.h, z23.h, z22.h
+; CHECK-NEXT: zip1 z0.d, z2.d, z0.d
+; CHECK-NEXT: zip1 z3.s, z7.s, z6.s
; CHECK-NEXT: lsl z0.b, z0.b, #7
-; CHECK-NEXT: lsl z1.b, z1.b, #7
+; CHECK-NEXT: zip1 z1.d, z3.d, z1.d
; CHECK-NEXT: asr z0.b, z0.b, #7
-; CHECK-NEXT: asr z1.b, z1.b, #7
+; CHECK-NEXT: lsl z1.b, z1.b, #7
; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT: asr z1.b, z1.b, #7
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x8]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_v32i8:
@@ -1466,23 +1492,18 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
; CHECK-LABEL: masked_load_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str wzr, [sp, #12]
+; CHECK-NEXT: fmov s1, wzr
+; CHECK-NEXT: mov z2.s, z0.s[1]
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_v2f16:
@@ -2318,33 +2339,21 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
; CHECK-LABEL: masked_load_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: mov z1.b, z0.b[3]
; CHECK-NEXT: mov z2.b, z0.b[2]
+; CHECK-NEXT: mov x8, #4 // =0x4
; CHECK-NEXT: mov z3.b, z0.b[1]
; CHECK-NEXT: mov z4.b, z0.b[7]
-; CHECK-NEXT: strh w8, [sp, #-16]!
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.b, z0.b[6]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[5]
-; CHECK-NEXT: mov z0.b, z0.b[4]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #4]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w9, [sp, #14]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: ldp d0, d1, [sp]
+; CHECK-NEXT: mov z5.b, z0.b[6]
+; CHECK-NEXT: mov z6.b, z0.b[5]
+; CHECK-NEXT: mov z7.b, z0.b[4]
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z2.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z3.h, z7.h, z6.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: zip1 z1.s, z3.s, z2.s
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: lsl z0.s, z0.s, #31
@@ -2357,7 +2366,6 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_v8f32:
@@ -2684,23 +2692,22 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
; CHECK-LABEL: masked_load_zext_v3i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: strh w3, [sp, #12]
+; CHECK-NEXT: fmov s0, w2
+; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: adrp x8, .LCPI13_0
+; CHECK-NEXT: fmov s2, w3
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: strh w2, [sp, #10]
-; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT: strh w1, [sp, #8]
-; CHECK-NEXT: ldr d1, [sp, #8]
-; CHECK-NEXT: and z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT: zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
@@ -2759,23 +2766,22 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
; CHECK-LABEL: masked_load_sext_v3i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: strh w3, [sp, #12]
+; CHECK-NEXT: fmov s0, w2
+; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: adrp x8, .LCPI14_0
+; CHECK-NEXT: fmov s2, w3
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: strh w2, [sp, #10]
-; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT: strh w1, [sp, #8]
-; CHECK-NEXT: ldr d1, [sp, #8]
-; CHECK-NEXT: and z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT: zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 0c3411e5f55148..2966ab12b8cad6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -293,78 +293,104 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
; CHECK-LABEL: masked_store_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldr w8, [sp, #96]
-; CHECK-NEXT: ldr w9, [sp, #88]
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: ldr w10, [sp, #120]
-; CHECK-NEXT: strb w7, [sp, #6]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: ldr w8, [sp, #80]
-; CHECK-NEXT: strb w9, [sp, #14]
-; CHECK-NEXT: ldr w9, [sp, #72]
-; CHECK-NEXT: strb w8, [sp, #13]
; CHECK-NEXT: ldr w8, [sp, #64]
-; CHECK-NEXT: strb w9, [sp, #12]
; CHECK-NEXT: ldr w9, [sp, #56]
-; CHECK-NEXT: strb w8, [sp, #11]
+; CHECK-NEXT: fmov s26, w2
+; CHECK-NEXT: ldr w10, [sp, #32]
+; CHECK-NEXT: ldr w11, [sp, #16]
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ldr w8, [sp, #48]
-; CHECK-NEXT: strb w9, [sp, #10]
+; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: ldr w9, [sp, #40]
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: ldr w8, [sp, #32]
-; CHECK-NEXT: strb w9, [sp, #8]
-; CHECK-NEXT: ldr w9, [sp, #216]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: ldr w8, [sp, #224]
-; CHECK-NEXT: strb w9, [sp, #30]
-; CHECK-NEXT: ldr w9, [sp, #200]
-; CHECK-NEXT: strb w8, [sp, #31]
-; CHECK-NEXT: ldr w8, [sp, #208]
-; CHECK-NEXT: strb w9, [sp, #28]
+; CHECK-NEXT: fmov s5, w10
+; CHECK-NEXT: fmov s7, w11
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: ldr w8, [sp, #24]
+; CHECK-NEXT: ldr w10, [sp, #176]
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: ldr w9, [sp, #8]
+; CHECK-NEXT: ldr w11, [sp, #168]
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: ldr w8, [sp]
+; CHECK-NEXT: fmov s19, w10
+; CHECK-NEXT: fmov s16, w9
; CHECK-NEXT: ldr w9, [sp, #184]
-; CHECK-NEXT: strb w8, [sp, #29]
+; CHECK-NEXT: fmov s20, w11
+; CHECK-NEXT: zip1 z4.b, z3.b, z2.b
+; CHECK-NEXT: fmov s3, w8
; CHECK-NEXT: ldr w8, [sp, #192]
-; CHECK-NEXT: strb w9, [sp, #26]
-; CHECK-NEXT: ldr w9, [sp, #168]
-; CHECK-NEXT: strb w8, [sp, #27]
-; CHECK-NEXT: ldr w8, [sp, #176]
-; CHECK-NEXT: strb w9, [sp, #24]
+; CHECK-NEXT: fmov s18, w9
; CHECK-NEXT: ldr w9, [sp, #152]
-; CHECK-NEXT: strb w8, [sp, #25]
+; CHECK-NEXT: ldr w10, [sp, #136]
+; CHECK-NEXT: fmov s17, w8
; CHECK-NEXT: ldr w8, [sp, #160]
-; CHECK-NEXT: strb w9, [sp, #22]
-; CHECK-NEXT: ldr w9, [sp, #136]
-; CHECK-NEXT: strb w8, [sp, #23]
+; CHECK-NEXT: ldr w11, [sp, #120]
+; CHECK-NEXT: fmov s21, w10
+; CHECK-NEXT: ldr w10, [sp, #88]
+; CHECK-NEXT: zip1 z1.b, z1.b, z0.b
+; CHECK-NEXT: fmov s23, w11
+; CHECK-NEXT: ldr w11, [sp, #72]
+; CHECK-NEXT: zip1 z0.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z17.b, z18.b, z17.b
+; CHECK-NEXT: zip1 z18.b, z20.b, z19.b
+; CHECK-NEXT: fmov s19, w8
+; CHECK-NEXT: fmov s20, w9
; CHECK-NEXT: ldr w8, [sp, #144]
-; CHECK-NEXT: strb w9, [sp, #20]
-; CHECK-NEXT: ldr w9, [sp, #112]
-; CHECK-NEXT: strb w8, [sp, #21]
-; CHECK-NEXT: ldr w8, [sp, #128]
-; CHECK-NEXT: strb w6, [sp, #5]
-; CHECK-NEXT: strb w8, [sp, #19]
-; CHECK-NEXT: ldr w8, [sp, #104]
-; CHECK-NEXT: strb w5, [sp, #4]
-; CHECK-NEXT: strb w4, [sp, #3]
-; CHECK-NEXT: strb w3, [sp, #2]
-; CHECK-NEXT: strb w2, [sp, #1]
-; CHECK-NEXT: strb w1, [sp]
-; CHECK-NEXT: strb w10, [sp, #18]
-; CHECK-NEXT: strb w9, [sp, #17]
-; CHECK-NEXT: strb w8, [sp, #16]
+; CHECK-NEXT: ldr w9, [sp, #128]
+; CHECK-NEXT: fmov s24, w10
+; CHECK-NEXT: fmov s5, w7
+; CHECK-NEXT: fmov s25, w11
+; CHECK-NEXT: fmov s22, w9
+; CHECK-NEXT: ldr w9, [sp, #104]
+; CHECK-NEXT: zip1 z2.b, z16.b, z7.b
+; CHECK-NEXT: zip1 z19.b, z20.b, z19.b
+; CHECK-NEXT: fmov s20, w8
+; CHECK-NEXT: ldr w8, [sp, #112]
+; CHECK-NEXT: zip1 z3.b, z5.b, z3.b
+; CHECK-NEXT: fmov s5, w6
+; CHECK-NEXT: fmov s6, w5
+; CHECK-NEXT: fmov s7, w4
+; CHECK-NEXT: fmov s16, w3
+; CHECK-NEXT: zip1 z1.h, z4.h, z1.h
+; CHECK-NEXT: zip1 z20.b, z21.b, z20.b
+; CHECK-NEXT: zip1 z21.b, z23.b, z22.b
+; CHECK-NEXT: fmov s22, w8
+; CHECK-NEXT: fmov s23, w9
+; CHECK-NEXT: ldr w8, [sp, #96]
+; CHECK-NEXT: ldr w9, [sp, #80]
+; CHECK-NEXT: zip1 z5.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z6.b, z16.b, z7.b
+; CHECK-NEXT: zip1 z4.h, z18.h, z17.h
+; CHECK-NEXT: zip1 z16.h, z20.h, z19.h
+; CHECK-NEXT: zip1 z0.h, z2.h, z0.h
+; CHECK-NEXT: zip1 z22.b, z23.b, z22.b
+; CHECK-NEXT: fmov s23, w8
; CHECK-NEXT: mov w8, #16 // =0x10
-; CHECK-NEXT: ldp q1, q0, [sp]
+; CHECK-NEXT: zip1 z2.h, z5.h, z3.h
+; CHECK-NEXT: zip1 z4.s, z16.s, z4.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: zip1 z23.b, z24.b, z23.b
+; CHECK-NEXT: fmov s24, w9
+; CHECK-NEXT: zip1 z17.h, z22.h, z21.h
+; CHECK-NEXT: zip1 z24.b, z25.b, z24.b
+; CHECK-NEXT: fmov s25, w1
+; CHECK-NEXT: zip1 z7.b, z25.b, z26.b
+; CHECK-NEXT: zip1 z18.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z3.h, z7.h, z6.h
+; CHECK-NEXT: zip1 z5.s, z18.s, z17.s
+; CHECK-NEXT: zip1 z1.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z2.d, z5.d, z4.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: lsl z1.b, z2.b, #7
; CHECK-NEXT: lsl z0.b, z0.b, #7
-; CHECK-NEXT: lsl z1.b, z1.b, #7
-; CHECK-NEXT: asr z0.b, z0.b, #7
; CHECK-NEXT: asr z1.b, z1.b, #7
-; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT: asr z0.b, z0.b, #7
+; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: mov z0.b, #0 // =0x0
; CHECK-NEXT: st1b { z0.b }, p1, [x0, x8]
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_store_v32i8:
@@ -589,23 +615,18 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
; CHECK-LABEL: masked_store_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str wzr, [sp, #12]
+; CHECK-NEXT: fmov s1, wzr
+; CHECK-NEXT: mov z2.s, z0.s[1]
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_store_v2f16:
@@ -1014,48 +1035,33 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
; CHECK-LABEL: masked_store_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z1.b, z0.b[7]
; CHECK-NEXT: mov z2.b, z0.b[6]
+; CHECK-NEXT: mov x8, #4 // =0x4
; CHECK-NEXT: mov z3.b, z0.b[5]
; CHECK-NEXT: mov z4.b, z0.b[4]
+; CHECK-NEXT: mov z5.b, z0.b[3]
+; CHECK-NEXT: mov z6.b, z0.b[2]
+; CHECK-NEXT: mov z7.b, z0.b[1]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[3]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[2]
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: mov z4.b, z0.b[1]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ldr d1, [sp, #8]
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z2.h, z4.h, z3.h
+; CHECK-NEXT: zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z7.h
+; CHECK-NEXT: zip1 z1.s, z2.s, z1.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z3.s
; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: lsl z0.s, z0.s, #31
; CHECK-NEXT: asr z1.s, z1.s, #31
+; CHECK-NEXT: asr z0.s, z0.s, #31
; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0
; CHECK-NEXT: mov z1.s, #0 // =0x0
-; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr d0, [sp]
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: lsl z0.s, z0.s, #31
-; CHECK-NEXT: asr z0.s, z0.s, #31
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; CHECK-NEXT: st1w { z1.s }, p0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_store_v8f32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index b91f813c5141bb..620e791c77e89f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -9,65 +9,44 @@ target triple = "aarch64-unknown-linux-gnu"
define void @zip1_v32i8(ptr %a, ptr %b) {
; CHECK-LABEL: zip1_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: mov z2.b, z0.b[15]
-; CHECK-NEXT: mov z3.b, z0.b[14]
-; CHECK-NEXT: mov z4.b, z0.b[13]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z3.b, z0.b[11]
-; CHECK-NEXT: mov z2.b, z0.b[12]
-; CHECK-NEXT: strb w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[10]
-; CHECK-NEXT: strb w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[9]
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[8]
-; CHECK-NEXT: strb w9, [sp, #8]
+; CHECK-NEXT: mov z4.b, z0.b[14]
+; CHECK-NEXT: mov z6.b, z0.b[13]
+; CHECK-NEXT: mov z3.b, z1.b[15]
+; CHECK-NEXT: mov z5.b, z1.b[14]
+; CHECK-NEXT: mov z7.b, z1.b[13]
+; CHECK-NEXT: mov z16.b, z0.b[12]
+; CHECK-NEXT: mov z17.b, z1.b[12]
+; CHECK-NEXT: mov z18.b, z0.b[11]
+; CHECK-NEXT: mov z19.b, z1.b[11]
+; CHECK-NEXT: mov z20.b, z0.b[10]
+; CHECK-NEXT: mov z21.b, z1.b[10]
+; CHECK-NEXT: mov z22.b, z0.b[9]
+; CHECK-NEXT: mov z23.b, z1.b[9]
+; CHECK-NEXT: mov z24.b, z0.b[8]
+; CHECK-NEXT: mov z25.b, z1.b[8]
+; CHECK-NEXT: zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT: zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT: zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT: zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT: zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT: zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT: zip1 z16.b, z22.b, z23.b
; CHECK-NEXT: zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT: strb w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[15]
-; CHECK-NEXT: strb w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[14]
-; CHECK-NEXT: strb w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[13]
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[12]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[11]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[10]
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[9]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[8]
-; CHECK-NEXT: strb w9, [sp, #5]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strb w8, [sp, #3]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #1]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT: zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z4.h, z7.h, z6.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip1_v32i8:
@@ -159,123 +138,97 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
define void @zip_v32i16(ptr %a, ptr %b) {
; CHECK-LABEL: zip_v32i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: ldp q1, q3, [x1]
-; CHECK-NEXT: ldp q0, q4, [x0]
-; CHECK-NEXT: ldp q2, q5, [x0, #32]
-; CHECK-NEXT: mov z16.h, z3.h[7]
-; CHECK-NEXT: mov z18.h, z3.h[6]
-; CHECK-NEXT: mov z17.h, z4.h[7]
-; CHECK-NEXT: ldp q6, q7, [x1, #32]
-; CHECK-NEXT: mov z19.h, z4.h[6]
-; CHECK-NEXT: fmov w8, s16
+; CHECK-NEXT: .cfi_offset b8, -8
+; CHECK-NEXT: .cfi_offset b9, -16
+; CHECK-NEXT: .cfi_offset b10, -24
+; CHECK-NEXT: .cfi_offset b11, -32
+; CHECK-NEXT: .cfi_offset b12, -40
+; CHECK-NEXT: .cfi_offset b13, -48
+; CHECK-NEXT: .cfi_offset b14, -56
+; CHECK-NEXT: .cfi_offset b15, -64
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q2, q3, [x1]
+; CHECK-NEXT: mov z5.h, z1.h[7]
+; CHECK-NEXT: mov z7.h, z1.h[6]
+; CHECK-NEXT: mov z17.h, z1.h[5]
+; CHECK-NEXT: mov z4.h, z3.h[7]
+; CHECK-NEXT: mov z6.h, z3.h[6]
; CHECK-NEXT: mov z16.h, z3.h[5]
-; CHECK-NEXT: fmov w9, s17
-; CHECK-NEXT: mov z17.h, z4.h[5]
-; CHECK-NEXT: mov z20.h, z7.h[6]
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s18
+; CHECK-NEXT: mov z20.h, z2.h[7]
+; CHECK-NEXT: mov z21.h, z0.h[7]
; CHECK-NEXT: mov z18.h, z3.h[4]
-; CHECK-NEXT: strh w9, [sp, #28]
-; CHECK-NEXT: fmov w9, s19
-; CHECK-NEXT: mov z19.h, z5.h[7]
-; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: mov z16.h, z4.h[4]
-; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: zip1 z4.h, z5.h, z7.h
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: mov z17.h, z1.h[7]
-; CHECK-NEXT: add z3.h, z3.h, z4.h
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z0.h[7]
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: mov z16.h, z1.h[6]
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: mov z17.h, z0.h[6]
-; CHECK-NEXT: strh w8, [sp, #62]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z1.h[5]
-; CHECK-NEXT: strh w8, [sp, #60]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: mov z16.h, z0.h[5]
-; CHECK-NEXT: strh w8, [sp, #58]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: mov z17.h, z1.h[4]
-; CHECK-NEXT: strh w8, [sp, #56]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z0.h[4]
-; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: zip1 z1.h, z2.h, z6.h
-; CHECK-NEXT: strh w8, [sp, #54]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: ldr q16, [sp, #16]
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: strh w8, [sp, #52]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: strh w8, [sp, #50]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z7.h[7]
-; CHECK-NEXT: strh w8, [sp, #48]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z5.h[6]
-; CHECK-NEXT: ldr q17, [sp, #48]
-; CHECK-NEXT: strh w8, [sp, #46]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z7.h[5]
-; CHECK-NEXT: strh w8, [sp, #44]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z5.h[5]
-; CHECK-NEXT: strh w8, [sp, #42]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z7.h[4]
-; CHECK-NEXT: strh w8, [sp, #40]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z5.h[4]
-; CHECK-NEXT: strh w8, [sp, #38]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z6.h[7]
-; CHECK-NEXT: strh w8, [sp, #36]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z2.h[7]
-; CHECK-NEXT: strh w8, [sp, #34]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z6.h[6]
-; CHECK-NEXT: strh w8, [sp, #32]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z2.h[6]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z6.h[5]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z2.h[5]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z6.h[4]
-; CHECK-NEXT: fmov w9, s19
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z2.h[4]
-; CHECK-NEXT: strh w9, [sp, #4]
-; CHECK-NEXT: ldr q2, [sp, #32]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: fmov w9, s18
-; CHECK-NEXT: add z2.h, z16.h, z2.h
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: ldr q4, [sp]
-; CHECK-NEXT: stp q3, q2, [x0, #32]
-; CHECK-NEXT: add z1.h, z17.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov z19.h, z1.h[4]
+; CHECK-NEXT: mov z22.h, z2.h[6]
+; CHECK-NEXT: mov z23.h, z0.h[6]
+; CHECK-NEXT: zip1 z24.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z25.h, z7.h, z6.h
+; CHECK-NEXT: zip1 z17.h, z17.h, z16.h
+; CHECK-NEXT: ldp q4, q6, [x0, #32]
+; CHECK-NEXT: zip1 z16.h, z21.h, z20.h
+; CHECK-NEXT: ldp q5, q7, [x1, #32]
+; CHECK-NEXT: zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z19.s, z25.s, z24.s
+; CHECK-NEXT: zip1 z22.h, z23.h, z22.h
+; CHECK-NEXT: mov z23.h, z2.h[5]
+; CHECK-NEXT: mov z21.h, z6.h[7]
+; CHECK-NEXT: mov z24.h, z0.h[5]
+; CHECK-NEXT: mov z25.h, z2.h[4]
+; CHECK-NEXT: mov z20.h, z7.h[7]
+; CHECK-NEXT: mov z26.h, z0.h[4]
+; CHECK-NEXT: mov z27.h, z6.h[6]
+; CHECK-NEXT: mov z28.h, z7.h[5]
+; CHECK-NEXT: mov z29.h, z6.h[5]
+; CHECK-NEXT: mov z30.h, z7.h[4]
+; CHECK-NEXT: mov z31.h, z6.h[4]
+; CHECK-NEXT: mov z8.h, z5.h[7]
+; CHECK-NEXT: mov z9.h, z4.h[7]
+; CHECK-NEXT: zip1 z20.h, z21.h, z20.h
+; CHECK-NEXT: mov z21.h, z7.h[6]
+; CHECK-NEXT: mov z10.h, z5.h[6]
+; CHECK-NEXT: mov z11.h, z4.h[6]
+; CHECK-NEXT: mov z12.h, z5.h[5]
+; CHECK-NEXT: mov z13.h, z4.h[5]
+; CHECK-NEXT: mov z14.h, z5.h[4]
+; CHECK-NEXT: mov z15.h, z4.h[4]
+; CHECK-NEXT: zip1 z23.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z21.h, z27.h, z21.h
+; CHECK-NEXT: zip1 z27.h, z29.h, z28.h
+; CHECK-NEXT: zip1 z28.h, z31.h, z30.h
+; CHECK-NEXT: zip1 z24.h, z26.h, z25.h
+; CHECK-NEXT: zip1 z25.h, z9.h, z8.h
+; CHECK-NEXT: zip1 z26.h, z11.h, z10.h
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z29.h, z13.h, z12.h
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z30.h, z15.h, z14.h
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z17.s, z18.s, z17.s
+; CHECK-NEXT: zip1 z18.s, z21.s, z20.s
+; CHECK-NEXT: zip1 z20.s, z28.s, z27.s
+; CHECK-NEXT: zip1 z16.s, z22.s, z16.s
+; CHECK-NEXT: zip1 z21.s, z24.s, z23.s
+; CHECK-NEXT: zip1 z1.h, z1.h, z3.h
+; CHECK-NEXT: zip1 z3.s, z26.s, z25.s
+; CHECK-NEXT: zip1 z22.s, z30.s, z29.s
+; CHECK-NEXT: zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z7.d, z17.d, z19.d
+; CHECK-NEXT: zip1 z17.d, z20.d, z18.d
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z2.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z4.d, z21.d, z16.d
+; CHECK-NEXT: zip1 z3.d, z22.d, z3.d
+; CHECK-NEXT: add z1.h, z1.h, z6.h
+; CHECK-NEXT: add z5.h, z7.h, z17.h
+; CHECK-NEXT: add z0.h, z0.h, z2.h
+; CHECK-NEXT: add z2.h, z4.h, z3.h
+; CHECK-NEXT: stp q1, q5, [x0, #32]
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip_v32i16:
@@ -436,41 +389,28 @@ define void @zip_v32i16(ptr %a, ptr %b) {
define void @zip1_v16i16(ptr %a, ptr %b) {
; CHECK-LABEL: zip1_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: mov z3.h, z0.h[6]
-; CHECK-NEXT: mov z4.h, z0.h[5]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[4]
-; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov z4.h, z0.h[6]
+; CHECK-NEXT: mov z6.h, z0.h[5]
; CHECK-NEXT: mov z3.h, z1.h[7]
+; CHECK-NEXT: mov z5.h, z1.h[6]
+; CHECK-NEXT: mov z7.h, z1.h[5]
+; CHECK-NEXT: mov z16.h, z0.h[4]
+; CHECK-NEXT: mov z17.h, z1.h[4]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[6]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z1.h[5]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.h, z1.h[4]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #10]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT: zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z5.h, z16.h, z17.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip1_v16i16:
@@ -530,8 +470,6 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
define void @zip1_v8i32(ptr %a, ptr %b) {
; CHECK-LABEL: zip1_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1, #16]
@@ -539,18 +477,13 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
; CHECK-NEXT: mov z2.s, z0.s[3]
; CHECK-NEXT: mov z4.s, z0.s[2]
; CHECK-NEXT: mov z3.s, z1.s[3]
+; CHECK-NEXT: mov z5.s, z1.s[2]
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z1.s[2]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w9, [sp]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z3.s, z4.s, z5.s
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip1_v8i32:
@@ -636,25 +569,18 @@ define void @zip_v4f64(ptr %a, ptr %b) {
define void @zip_v4i32(ptr %a, ptr %b) {
; CHECK-LABEL: zip_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x1]
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: mov z2.s, z0.s[3]
; CHECK-NEXT: mov z3.s, z1.s[3]
; CHECK-NEXT: mov z4.s, z0.s[2]
+; CHECK-NEXT: mov z5.s, z1.s[2]
; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z1.s[2]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: stp w9, w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w9, w8, [sp]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip_v4i32:
@@ -1209,65 +1135,44 @@ define void @trn_v8i32_undef(ptr %a) {
define void @zip2_v32i8(ptr %a, ptr %b) #0{
; CHECK-LABEL: zip2_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: mov z2.b, z0.b[15]
-; CHECK-NEXT: mov z3.b, z0.b[14]
-; CHECK-NEXT: mov z4.b, z0.b[13]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z3.b, z0.b[11]
-; CHECK-NEXT: mov z2.b, z0.b[12]
-; CHECK-NEXT: strb w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[10]
-; CHECK-NEXT: strb w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[9]
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[8]
-; CHECK-NEXT: strb w9, [sp, #8]
+; CHECK-NEXT: mov z4.b, z0.b[14]
+; CHECK-NEXT: mov z6.b, z0.b[13]
+; CHECK-NEXT: mov z3.b, z1.b[15]
+; CHECK-NEXT: mov z5.b, z1.b[14]
+; CHECK-NEXT: mov z7.b, z1.b[13]
+; CHECK-NEXT: mov z16.b, z0.b[12]
+; CHECK-NEXT: mov z17.b, z1.b[12]
+; CHECK-NEXT: mov z18.b, z0.b[11]
+; CHECK-NEXT: mov z19.b, z1.b[11]
+; CHECK-NEXT: mov z20.b, z0.b[10]
+; CHECK-NEXT: mov z21.b, z1.b[10]
+; CHECK-NEXT: mov z22.b, z0.b[9]
+; CHECK-NEXT: mov z23.b, z1.b[9]
+; CHECK-NEXT: mov z24.b, z0.b[8]
+; CHECK-NEXT: mov z25.b, z1.b[8]
+; CHECK-NEXT: zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT: zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT: zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT: zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT: zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT: zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT: zip1 z16.b, z22.b, z23.b
; CHECK-NEXT: zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT: strb w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[15]
-; CHECK-NEXT: strb w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[14]
-; CHECK-NEXT: strb w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[13]
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[12]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[11]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[10]
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[9]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[8]
-; CHECK-NEXT: strb w9, [sp, #5]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strb w8, [sp, #3]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #1]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT: zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z4.h, z7.h, z6.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip2_v32i8:
@@ -1359,41 +1264,28 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
define void @zip2_v16i16(ptr %a, ptr %b) #0{
; CHECK-LABEL: zip2_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: mov z3.h, z0.h[6]
-; CHECK-NEXT: mov z4.h, z0.h[5]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[4]
-; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov z4.h, z0.h[6]
+; CHECK-NEXT: mov z6.h, z0.h[5]
; CHECK-NEXT: mov z3.h, z1.h[7]
+; CHECK-NEXT: mov z5.h, z1.h[6]
+; CHECK-NEXT: mov z7.h, z1.h[5]
+; CHECK-NEXT: mov z16.h, z0.h[4]
+; CHECK-NEXT: mov z17.h, z1.h[4]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[6]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z1.h[5]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.h, z1.h[4]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #10]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT: zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z5.h, z16.h, z17.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip2_v16i16:
@@ -1453,8 +1345,6 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
define void @zip2_v8i32(ptr %a, ptr %b) #0{
; CHECK-LABEL: zip2_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1]
@@ -1462,18 +1352,13 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
; CHECK-NEXT: mov z2.s, z0.s[3]
; CHECK-NEXT: mov z4.s, z0.s[2]
; CHECK-NEXT: mov z3.s, z1.s[3]
+; CHECK-NEXT: mov z5.s, z1.s[2]
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z1.s[2]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w9, [sp]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z3.s, z4.s, z5.s
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip2_v8i32:
@@ -1547,197 +1432,144 @@ define void @zip2_v8i32_undef(ptr %a) #0{
define void @uzp_v32i8(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str d14, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: ldp q2, q3, [x0]
-; CHECK-NEXT: ldp q0, q1, [x1]
-; CHECK-NEXT: mov z4.b, z3.b[14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z6.b, z3.b[10]
-; CHECK-NEXT: mov z5.b, z3.b[12]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z7.b, z3.b[8]
-; CHECK-NEXT: mov z17.b, z3.b[9]
-; CHECK-NEXT: mov z18.b, z3.b[7]
-; CHECK-NEXT: mov z16.b, z3.b[11]
-; CHECK-NEXT: strb w8, [sp, #40]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z3.b[6]
-; CHECK-NEXT: strb w9, [sp, #32]
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: mov z5.b, z3.b[4]
-; CHECK-NEXT: strb w8, [sp, #47]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z3.b[2]
-; CHECK-NEXT: strb w9, [sp, #46]
-; CHECK-NEXT: fmov w9, s7
-; CHECK-NEXT: mov z7.b, z2.b[14]
-; CHECK-NEXT: strb w8, [sp, #45]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z2.b[12]
-; CHECK-NEXT: strb w9, [sp, #44]
-; CHECK-NEXT: fmov w9, s16
-; CHECK-NEXT: mov z16.b, z2.b[11]
-; CHECK-NEXT: strb w8, [sp, #43]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z2.b[10]
-; CHECK-NEXT: strb w9, [sp, #61]
-; CHECK-NEXT: fmov w9, s16
-; CHECK-NEXT: strb w8, [sp, #42]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z2.b[8]
-; CHECK-NEXT: strb w9, [sp, #53]
-; CHECK-NEXT: strb w8, [sp, #41]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z2.b[6]
-; CHECK-NEXT: strb w8, [sp, #39]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z2.b[4]
-; CHECK-NEXT: strb w8, [sp, #38]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z2.b[2]
-; CHECK-NEXT: strb w8, [sp, #37]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[10]
-; CHECK-NEXT: strb w8, [sp, #36]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z1.b[8]
-; CHECK-NEXT: strb w8, [sp, #35]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[14]
-; CHECK-NEXT: strb w8, [sp, #34]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z1.b[12]
-; CHECK-NEXT: strb w8, [sp, #33]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[6]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z1.b[4]
-; CHECK-NEXT: strb w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[2]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z0.b[14]
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[12]
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z0.b[10]
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z0.b[8]
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z0.b[6]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[4]
-; CHECK-NEXT: strb w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z0.b[2]
-; CHECK-NEXT: strb w8, [sp, #5]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z3.b[15]
-; CHECK-NEXT: strb w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z3.b[13]
-; CHECK-NEXT: strb w8, [sp, #3]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: ldr q4, [sp, #32]
-; CHECK-NEXT: strb w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strb w8, [sp, #1]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z3.b[5]
-; CHECK-NEXT: mov z3.b, z3.b[3]
-; CHECK-NEXT: ldr q5, [sp]
-; CHECK-NEXT: strb w8, [sp, #63]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z2.b[13]
-; CHECK-NEXT: strb w8, [sp, #62]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: strb w8, [sp, #60]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: strb w8, [sp, #59]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z2.b[9]
-; CHECK-NEXT: strb w8, [sp, #58]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z2.b[5]
-; CHECK-NEXT: strb w8, [sp, #57]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z2.b[3]
+; CHECK-NEXT: .cfi_offset b8, -8
+; CHECK-NEXT: .cfi_offset b9, -16
+; CHECK-NEXT: .cfi_offset b10, -24
+; CHECK-NEXT: .cfi_offset b11, -32
+; CHECK-NEXT: .cfi_offset b12, -40
+; CHECK-NEXT: .cfi_offset b13, -48
+; CHECK-NEXT: .cfi_offset b14, -64
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: mov z2.b, z1.b[14]
+; CHECK-NEXT: mov z3.b, z1.b[12]
+; CHECK-NEXT: mov z4.b, z1.b[10]
+; CHECK-NEXT: mov z5.b, z1.b[8]
+; CHECK-NEXT: mov z7.b, z1.b[6]
+; CHECK-NEXT: mov z16.b, z1.b[4]
+; CHECK-NEXT: mov z17.b, z1.b[2]
+; CHECK-NEXT: mov z18.b, z0.b[14]
+; CHECK-NEXT: mov z20.b, z0.b[12]
+; CHECK-NEXT: zip1 z3.b, z3.b, z2.b
+; CHECK-NEXT: ldp q2, q19, [x1]
+; CHECK-NEXT: zip1 z6.b, z5.b, z4.b
+; CHECK-NEXT: zip1 z4.b, z16.b, z7.b
+; CHECK-NEXT: mov z16.b, z0.b[10]
+; CHECK-NEXT: zip1 z5.b, z1.b, z17.b
+; CHECK-NEXT: zip1 z7.b, z20.b, z18.b
+; CHECK-NEXT: mov z17.b, z0.b[8]
+; CHECK-NEXT: mov z18.b, z0.b[6]
+; CHECK-NEXT: mov z20.b, z0.b[4]
+; CHECK-NEXT: mov z21.b, z0.b[2]
+; CHECK-NEXT: mov z22.b, z19.b[14]
+; CHECK-NEXT: mov z23.b, z19.b[12]
+; CHECK-NEXT: mov z24.b, z19.b[10]
+; CHECK-NEXT: mov z25.b, z19.b[8]
+; CHECK-NEXT: mov z26.b, z19.b[6]
+; CHECK-NEXT: mov z27.b, z19.b[4]
+; CHECK-NEXT: mov z28.b, z19.b[2]
+; CHECK-NEXT: mov z29.b, z2.b[14]
+; CHECK-NEXT: mov z30.b, z2.b[12]
+; CHECK-NEXT: mov z31.b, z2.b[10]
+; CHECK-NEXT: mov z8.b, z2.b[8]
+; CHECK-NEXT: zip1 z16.b, z17.b, z16.b
+; CHECK-NEXT: zip1 z17.b, z20.b, z18.b
+; CHECK-NEXT: zip1 z18.b, z0.b, z21.b
+; CHECK-NEXT: zip1 z20.b, z23.b, z22.b
+; CHECK-NEXT: zip1 z21.b, z25.b, z24.b
+; CHECK-NEXT: zip1 z22.b, z27.b, z26.b
+; CHECK-NEXT: zip1 z23.b, z19.b, z28.b
+; CHECK-NEXT: zip1 z24.b, z30.b, z29.b
+; CHECK-NEXT: zip1 z25.b, z8.b, z31.b
+; CHECK-NEXT: zip1 z3.h, z6.h, z3.h
+; CHECK-NEXT: zip1 z4.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z5.h, z16.h, z7.h
+; CHECK-NEXT: zip1 z7.h, z18.h, z17.h
+; CHECK-NEXT: zip1 z16.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z17.h, z23.h, z22.h
+; CHECK-NEXT: mov z21.b, z19.b[15]
+; CHECK-NEXT: zip1 z6.h, z25.h, z24.h
+; CHECK-NEXT: mov z22.b, z19.b[13]
+; CHECK-NEXT: mov z23.b, z19.b[11]
+; CHECK-NEXT: mov z24.b, z19.b[9]
+; CHECK-NEXT: mov z26.b, z2.b[6]
+; CHECK-NEXT: mov z27.b, z2.b[4]
+; CHECK-NEXT: mov z20.b, z2.b[2]
+; CHECK-NEXT: mov z25.b, z19.b[7]
+; CHECK-NEXT: mov z28.b, z19.b[1]
+; CHECK-NEXT: zip1 z21.b, z22.b, z21.b
+; CHECK-NEXT: mov z29.b, z2.b[15]
+; CHECK-NEXT: mov z30.b, z2.b[13]
+; CHECK-NEXT: zip1 z22.b, z24.b, z23.b
+; CHECK-NEXT: mov z23.b, z1.b[15]
+; CHECK-NEXT: mov z24.b, z1.b[13]
+; CHECK-NEXT: zip1 z18.b, z27.b, z26.b
+; CHECK-NEXT: mov z26.b, z19.b[5]
+; CHECK-NEXT: mov z27.b, z19.b[3]
+; CHECK-NEXT: mov z31.b, z1.b[9]
+; CHECK-NEXT: zip1 z20.b, z2.b, z20.b
+; CHECK-NEXT: mov z8.b, z1.b[7]
+; CHECK-NEXT: zip1 z23.b, z24.b, z23.b
+; CHECK-NEXT: mov z24.b, z1.b[11]
+; CHECK-NEXT: mov z9.b, z1.b[5]
+; CHECK-NEXT: zip1 z19.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z25.b, z28.b, z27.b
+; CHECK-NEXT: zip1 z26.b, z30.b, z29.b
+; CHECK-NEXT: mov z27.b, z2.b[11]
+; CHECK-NEXT: mov z28.b, z2.b[9]
+; CHECK-NEXT: mov z29.b, z2.b[7]
+; CHECK-NEXT: mov z30.b, z2.b[5]
+; CHECK-NEXT: mov z10.b, z0.b[11]
+; CHECK-NEXT: mov z11.b, z0.b[9]
+; CHECK-NEXT: mov z12.b, z0.b[3]
+; CHECK-NEXT: mov z13.b, z0.b[1]
+; CHECK-NEXT: mov z1.b, z1.b[3]
+; CHECK-NEXT: mov z14.b, z0.b[13]
+; CHECK-NEXT: mov z0.b, z0.b[5]
+; CHECK-NEXT: zip1 z24.b, z31.b, z24.b
+; CHECK-NEXT: mov z31.b, z2.b[3]
; CHECK-NEXT: mov z2.b, z2.b[1]
-; CHECK-NEXT: strb w8, [sp, #54]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[15]
-; CHECK-NEXT: strb w8, [sp, #52]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[13]
-; CHECK-NEXT: strb w8, [sp, #50]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z1.b[11]
-; CHECK-NEXT: strb w8, [sp, #49]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[9]
-; CHECK-NEXT: strb w8, [sp, #48]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[7]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[15]
-; CHECK-NEXT: strb w8, [sp, #31]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[5]
-; CHECK-NEXT: strb w9, [sp, #28]
-; CHECK-NEXT: strb w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z1.b[3]
-; CHECK-NEXT: mov z1.b, z1.b[1]
-; CHECK-NEXT: strb w8, [sp, #29]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z0.b[11]
-; CHECK-NEXT: strb w8, [sp, #27]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[13]
-; CHECK-NEXT: strb w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: strb w8, [sp, #25]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.b, z0.b[9]
-; CHECK-NEXT: strb w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z0.b[7]
-; CHECK-NEXT: strb w8, [sp, #23]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[5]
-; CHECK-NEXT: strb w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z0.b[3]
-; CHECK-NEXT: mov z0.b, z0.b[1]
-; CHECK-NEXT: strb w8, [sp, #21]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #19]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strb w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: strb w8, [sp, #17]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: ldr q0, [sp, #48]
-; CHECK-NEXT: add z0.b, z4.b, z0.b
-; CHECK-NEXT: strb w8, [sp, #16]
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: add z1.b, z5.b, z1.b
+; CHECK-NEXT: zip1 z8.b, z9.b, z8.b
+; CHECK-NEXT: zip1 z9.b, z11.b, z10.b
+; CHECK-NEXT: zip1 z10.b, z13.b, z12.b
+; CHECK-NEXT: zip1 z27.b, z28.b, z27.b
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z1.b, z0.b, z1.b
+; CHECK-NEXT: zip1 z11.b, z14.b, z0.b
+; CHECK-NEXT: zip1 z0.b, z0.b, z0.b
+; CHECK-NEXT: zip1 z28.b, z30.b, z29.b
+; CHECK-NEXT: zip1 z2.b, z2.b, z31.b
+; CHECK-NEXT: zip1 z18.h, z20.h, z18.h
+; CHECK-NEXT: zip1 z20.h, z22.h, z21.h
+; CHECK-NEXT: zip1 z21.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z8.h
+; CHECK-NEXT: zip1 z19.h, z25.h, z19.h
+; CHECK-NEXT: zip1 z22.h, z9.h, z11.h
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z0.h, z10.h, z0.h
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z23.h, z27.h, z26.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT: zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT: zip1 z4.s, z7.s, z5.s
+; CHECK-NEXT: zip1 z5.s, z17.s, z16.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z21.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z22.s
+; CHECK-NEXT: zip1 z6.s, z18.s, z6.s
+; CHECK-NEXT: zip1 z7.s, z19.s, z20.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z23.s
+; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: zip1 z1.d, z6.d, z5.d
+; CHECK-NEXT: zip1 z2.d, z2.d, z7.d
+; CHECK-NEXT: add z0.b, z3.b, z0.b
+; CHECK-NEXT: add z1.b, z1.b, z2.b
; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v32i8:
@@ -1922,110 +1754,71 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
define void @uzp_v16i16(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: ldp q2, q3, [x0]
-; CHECK-NEXT: ldp q0, q1, [x1]
-; CHECK-NEXT: mov z4.h, z3.h[6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z6.h, z3.h[2]
-; CHECK-NEXT: mov z5.h, z3.h[4]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z7.h, z2.h[6]
-; CHECK-NEXT: mov z17.h, z2.h[7]
-; CHECK-NEXT: mov z16.h, z3.h[1]
-; CHECK-NEXT: strh w8, [sp, #40]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z2.h[4]
-; CHECK-NEXT: strh w9, [sp, #32]
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: mov z5.h, z2.h[2]
-; CHECK-NEXT: strh w8, [sp, #46]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z1.h[2]
-; CHECK-NEXT: strh w9, [sp, #44]
-; CHECK-NEXT: fmov w9, s7
-; CHECK-NEXT: mov z7.h, z0.h[6]
-; CHECK-NEXT: strh w8, [sp, #42]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[6]
-; CHECK-NEXT: strh w9, [sp, #38]
-; CHECK-NEXT: fmov w9, s16
-; CHECK-NEXT: strh w8, [sp, #36]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z1.h[4]
-; CHECK-NEXT: strh w9, [sp, #56]
-; CHECK-NEXT: strh w8, [sp, #34]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z0.h[4]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z0.h[2]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z3.h[7]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.h, z3.h[5]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z3.h[3]
-; CHECK-NEXT: ldr q3, [sp, #32]
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z2.h[5]
-; CHECK-NEXT: ldr q4, [sp]
-; CHECK-NEXT: strh w8, [sp, #62]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.h, z1.h[7]
-; CHECK-NEXT: strh w8, [sp, #60]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z2.h[3]
-; CHECK-NEXT: mov z2.h, z2.h[1]
-; CHECK-NEXT: strh w8, [sp, #58]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: strh w8, [sp, #54]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z1.h[5]
-; CHECK-NEXT: strh w9, [sp, #48]
-; CHECK-NEXT: strh w8, [sp, #52]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z1.h[3]
+; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset b8, -16
+; CHECK-NEXT: ldp q1, q6, [x0]
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: mov z3.h, z6.h[6]
+; CHECK-NEXT: mov z4.h, z6.h[4]
+; CHECK-NEXT: mov z5.h, z6.h[2]
+; CHECK-NEXT: mov z7.h, z1.h[6]
+; CHECK-NEXT: mov z16.h, z1.h[4]
+; CHECK-NEXT: mov z17.h, z1.h[2]
+; CHECK-NEXT: mov z18.h, z2.h[6]
+; CHECK-NEXT: mov z19.h, z2.h[4]
+; CHECK-NEXT: mov z20.h, z2.h[2]
+; CHECK-NEXT: mov z21.h, z0.h[6]
+; CHECK-NEXT: mov z22.h, z0.h[4]
+; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z5.h, z16.h, z7.h
+; CHECK-NEXT: zip1 z7.h, z1.h, z17.h
+; CHECK-NEXT: zip1 z16.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z18.h, z2.h, z20.h
+; CHECK-NEXT: mov z19.h, z0.h[2]
+; CHECK-NEXT: zip1 z17.h, z22.h, z21.h
+; CHECK-NEXT: mov z20.h, z6.h[7]
+; CHECK-NEXT: mov z21.h, z6.h[5]
+; CHECK-NEXT: mov z22.h, z6.h[3]
+; CHECK-NEXT: mov z6.h, z6.h[1]
+; CHECK-NEXT: mov z23.h, z1.h[7]
+; CHECK-NEXT: mov z24.h, z1.h[5]
+; CHECK-NEXT: mov z25.h, z1.h[3]
; CHECK-NEXT: mov z1.h, z1.h[1]
-; CHECK-NEXT: strh w8, [sp, #50]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z0.h[5]
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[1]
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: ldr q0, [sp, #48]
-; CHECK-NEXT: add z0.h, z3.h, z0.h
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: add z1.h, z4.h, z1.h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov z26.h, z2.h[7]
+; CHECK-NEXT: mov z27.h, z2.h[5]
+; CHECK-NEXT: mov z28.h, z2.h[3]
+; CHECK-NEXT: mov z2.h, z2.h[1]
+; CHECK-NEXT: mov z29.h, z0.h[7]
+; CHECK-NEXT: mov z30.h, z0.h[5]
+; CHECK-NEXT: mov z31.h, z0.h[3]
+; CHECK-NEXT: mov z8.h, z0.h[1]
+; CHECK-NEXT: zip1 z0.h, z0.h, z19.h
+; CHECK-NEXT: zip1 z19.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z6.h, z6.h, z22.h
+; CHECK-NEXT: zip1 z20.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z25.h
+; CHECK-NEXT: zip1 z21.h, z27.h, z26.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT: zip1 z22.h, z30.h, z29.h
+; CHECK-NEXT: zip1 z23.h, z8.h, z31.h
+; CHECK-NEXT: zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT: zip1 z4.s, z7.s, z5.s
+; CHECK-NEXT: zip1 z5.s, z18.s, z16.s
+; CHECK-NEXT: zip1 z6.s, z6.s, z19.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z20.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z17.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z21.s
+; CHECK-NEXT: zip1 z7.s, z23.s, z22.s
+; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z5.d
+; CHECK-NEXT: zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT: add z1.h, z3.h, z1.h
+; CHECK-NEXT: add z0.h, z0.h, z2.h
+; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v16i16:
@@ -2116,32 +1909,31 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
define void @uzp_v8f32(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: ldp q2, q0, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: adrp x8, .LCPI21_0
-; CHECK-NEXT: ldp q4, q1, [x1]
+; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: mov z3.s, z0.s[2]
-; CHECK-NEXT: mov z5.s, z1.s[2]
-; CHECK-NEXT: stp s0, s3, [sp, #24]
-; CHECK-NEXT: mov z3.s, z4.s[2]
-; CHECK-NEXT: stp s5, s2, [sp, #12]
+; CHECK-NEXT: mov z4.s, z0.s[2]
; CHECK-NEXT: mov z5.s, z0.s[3]
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: stp s3, s1, [sp, #4]
-; CHECK-NEXT: mov z1.s, z2.s[1]
-; CHECK-NEXT: str s5, [sp, #44]
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT: str s0, [sp, #40]
-; CHECK-NEXT: ldp q3, q2, [sp]
-; CHECK-NEXT: tbl z0.s, { z4.s }, z5.s
-; CHECK-NEXT: str s1, [sp, #32]
-; CHECK-NEXT: ldr q1, [sp, #32]
+; CHECK-NEXT: mov z6.s, z0.s[1]
+; CHECK-NEXT: mov z7.s, z1.s[1]
+; CHECK-NEXT: mov z16.s, z3.s[2]
+; CHECK-NEXT: mov z17.s, z2.s[2]
+; CHECK-NEXT: zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT: zip1 z4.s, z6.s, z5.s
+; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT: zip1 z3.s, z3.s, z16.s
+; CHECK-NEXT: tbl z2.s, { z2.s }, z6.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z0.s
+; CHECK-NEXT: zip1 z5.s, z7.s, z0.s
+; CHECK-NEXT: zip1 z7.s, z0.s, z17.s
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z1.d, z5.d, z4.d
+; CHECK-NEXT: zip1 z3.d, z7.d, z3.d
+; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: movprfx z1, z3
; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT: stp q1, q0, [x0]
-; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v8f32:
@@ -2231,60 +2023,38 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
define void @uzp_v8i16(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: mov z2.h, z1.h[6]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z4.h, z1.h[2]
-; CHECK-NEXT: mov z6.h, z0.h[4]
-; CHECK-NEXT: mov z3.h, z1.h[4]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z5.h, z0.h[6]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[2]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z3.h, z1.h[7]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[5]
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: mov z5.h, z1.h[3]
-; CHECK-NEXT: mov z1.h, z1.h[1]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: strh w9, [sp, #6]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z0.h[5]
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[1]
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: ldp q3, q0, [sp]
-; CHECK-NEXT: add z0.h, z3.h, z0.h
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: mov z2.h, z0.h[6]
+; CHECK-NEXT: mov z3.h, z0.h[4]
+; CHECK-NEXT: mov z4.h, z0.h[2]
+; CHECK-NEXT: mov z5.h, z1.h[6]
+; CHECK-NEXT: mov z6.h, z1.h[4]
+; CHECK-NEXT: mov z7.h, z1.h[2]
+; CHECK-NEXT: mov z16.h, z0.h[7]
+; CHECK-NEXT: mov z17.h, z0.h[5]
+; CHECK-NEXT: mov z18.h, z0.h[3]
+; CHECK-NEXT: mov z19.h, z0.h[1]
+; CHECK-NEXT: mov z20.h, z1.h[7]
+; CHECK-NEXT: mov z21.h, z1.h[5]
+; CHECK-NEXT: mov z22.h, z1.h[3]
+; CHECK-NEXT: mov z23.h, z1.h[1]
+; CHECK-NEXT: zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT: zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z7.h
+; CHECK-NEXT: zip1 z4.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z5.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z6.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z7.h, z23.h, z22.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT: zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
+; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v8i16:
@@ -2341,31 +2111,21 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
define void @uzp_v8i32_undef(ptr %a) #0{
; CHECK-LABEL: uzp_v8i32_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: mov z2.s, z0.s[2]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z3.s, z1.s[2]
-; CHECK-NEXT: mov z4.s, z0.s[3]
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.s, z1.s[3]
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: stp w8, w9, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: stp w9, w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: stp w9, w8, [sp, #16]
-; CHECK-NEXT: ldp q0, q1, [sp]
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: mov z2.s, z1.s[2]
+; CHECK-NEXT: mov z3.s, z0.s[2]
+; CHECK-NEXT: mov z4.s, z1.s[3]
+; CHECK-NEXT: mov z5.s, z1.s[1]
+; CHECK-NEXT: mov z6.s, z0.s[3]
+; CHECK-NEXT: mov z7.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT: zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: stp q0, q0, [x0]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
index 88c83a214c7394..c942f1eca8ebaf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
@@ -10,22 +10,14 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
; CHECK-NEXT: mov z1.s, z0.s[3]
-; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: mov z2.s, z0.s[2]
; CHECK-NEXT: mov z3.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%el0 = extractelement <vscale x 4 x i1> %a, i32 0
%el1 = extractelement <vscale x 4 x i1> %a, i32 1
>From b852c6152e3bdbc3a6dcf9ca1e600ebae0cd08b6 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 9 Oct 2024 18:17:10 +0000
Subject: [PATCH 2/3] Handle UNDEF better
---
.../Target/AArch64/AArch64ISelLowering.cpp | 15 +-
.../sve-fixed-length-vector-shuffle-tbl.ll | 27 +-
...sve-streaming-mode-fixed-length-bitcast.ll | 2 -
...-streaming-mode-fixed-length-fp-vselect.ll | 8 +-
...ing-mode-fixed-length-insert-vector-elt.ll | 2 -
...e-streaming-mode-fixed-length-int-to-fp.ll | 20 +-
...-streaming-mode-fixed-length-ld2-alloca.ll | 3 +-
...streaming-mode-fixed-length-masked-load.ll | 6 +-
...g-mode-fixed-length-permute-zip-uzp-trn.ll | 260 +++++++++---------
9 files changed, 162 insertions(+), 181 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6c1c33da4be996..71115705407bd6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14408,16 +14408,21 @@ SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
SmallVector<SDValue, 16> Intermediates =
llvm::map_to_vector<16>(Op->op_values(), [&](SDValue Op) {
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
- DAG.getUNDEF(ZipVT), Op, ZeroI64);
+ SDValue Undef = DAG.getUNDEF(ZipVT);
+ return Op.isUndef() ? Undef
+ : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
+ Undef, Op, ZeroI64);
});
while (Intermediates.size() > 1) {
auto ToZipVT = [&](SDValue Op) { return DAG.getBitcast(ZipVT, Op); };
for (unsigned I = 0; I < Intermediates.size(); I += 2) {
- SDValue Op0 = ToZipVT(Intermediates[I + 0]);
- SDValue Op1 = ToZipVT(Intermediates[I + 1]);
- Intermediates[I / 2] = DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+ SDValue Op0 = Intermediates[I + 0];
+ SDValue Op1 = Intermediates[I + 1];
+ Intermediates[I / 2] = Op1.isUndef()
+ ? Op0
+ : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT,
+ ToZipVT(Op0), ToZipVT(Op1));
}
Intermediates.resize(Intermediates.size() / 2);
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index a22c00c1ebce14..20659cde83ee00 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -236,14 +236,13 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[6]
; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z0.b
-; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
@@ -256,14 +255,13 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[6]
; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z0.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
@@ -276,14 +274,13 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[6]
; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z0.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 172e2454d70283..6644be11a02ba7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -95,8 +95,6 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z1.s, z0.s[1]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index e3c89981cb27af..ad5f91a5f39a49 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -9,13 +9,11 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
; CHECK-LABEL: select_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
-; CHECK-NEXT: mov z4.s, z2.s[1]
-; CHECK-NEXT: zip1 z3.h, z0.h, z0.h
+; CHECK-NEXT: mov z3.s, z2.s[1]
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: zip1 z2.h, z2.h, z4.h
-; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
; CHECK-NEXT: lsl z2.h, z2.h, #15
; CHECK-NEXT: asr z2.h, z2.h, #15
; CHECK-NEXT: and z2.h, z2.h, #0x1
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index 87e3d0d09817ba..275d13ebfd9491 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -509,8 +509,6 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
; CHECK-NEXT: fmov h1, #5.00000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index f71bfb770b15f4..f9f70d30a757eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1161,14 +1161,12 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
; CHECK-LABEL: ucvtf_v2i64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: mov z2.d, z0.d[1]
+; CHECK-NEXT: mov z1.d, z0.d[1]
; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: ucvtf h0, x8
-; CHECK-NEXT: ucvtf h2, x9
-; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ucvtf h1, x9
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
@@ -2617,14 +2615,12 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
; CHECK-LABEL: scvtf_v2i64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: mov z2.d, z0.d[1]
+; CHECK-NEXT: mov z1.d, z0.d[1]
; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: zip1 z1.h, z0.h, z0.h
-; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: scvtf h0, x8
-; CHECK-NEXT: scvtf h2, x9
-; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: scvtf h1, x9
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index ef6b1c9acbf105..613543310f2c31 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -70,8 +70,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: mov z2.b, z0.b[1]
; CHECK-NEXT: mov z0.b, z0.b[5]
; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
-; CHECK-NEXT: zip1 z2.h, z0.h, z0.h
-; CHECK-NEXT: zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z0.s
; CHECK-NEXT: st1b { z1.h }, p0, [x8]
; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
; CHECK-NEXT: fmov w8, s0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 1b90aed22f9d8d..4980ee4d7f74b7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -2695,10 +2695,9 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
; CHECK-NEXT: fmov s0, w2
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: adrp x8, .LCPI13_0
-; CHECK-NEXT: fmov s2, w3
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
-; CHECK-NEXT: zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT: fmov s1, w3
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: and z0.d, z0.d, z1.d
@@ -2769,10 +2768,9 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
; CHECK-NEXT: fmov s0, w2
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: adrp x8, .LCPI14_0
-; CHECK-NEXT: fmov s2, w3
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
-; CHECK-NEXT: zip1 z1.h, z2.h, z0.h
+; CHECK-NEXT: fmov s1, w3
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: and z0.d, z0.d, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index 620e791c77e89f..8b296d9fbc215d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -1432,136 +1432,131 @@ define void @zip2_v8i32_undef(ptr %a) #0{
define void @uzp_v32i8(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: str d14, [sp, #-64]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: stp d13, d12, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset b8, -8
; CHECK-NEXT: .cfi_offset b9, -16
; CHECK-NEXT: .cfi_offset b10, -24
; CHECK-NEXT: .cfi_offset b11, -32
; CHECK-NEXT: .cfi_offset b12, -40
; CHECK-NEXT: .cfi_offset b13, -48
-; CHECK-NEXT: .cfi_offset b14, -64
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: mov z2.b, z1.b[14]
; CHECK-NEXT: mov z3.b, z1.b[12]
; CHECK-NEXT: mov z4.b, z1.b[10]
; CHECK-NEXT: mov z5.b, z1.b[8]
-; CHECK-NEXT: mov z7.b, z1.b[6]
-; CHECK-NEXT: mov z16.b, z1.b[4]
-; CHECK-NEXT: mov z17.b, z1.b[2]
+; CHECK-NEXT: mov z6.b, z1.b[6]
+; CHECK-NEXT: mov z7.b, z1.b[4]
+; CHECK-NEXT: mov z16.b, z1.b[2]
; CHECK-NEXT: mov z18.b, z0.b[14]
-; CHECK-NEXT: mov z20.b, z0.b[12]
+; CHECK-NEXT: mov z19.b, z0.b[12]
; CHECK-NEXT: zip1 z3.b, z3.b, z2.b
-; CHECK-NEXT: ldp q2, q19, [x1]
-; CHECK-NEXT: zip1 z6.b, z5.b, z4.b
-; CHECK-NEXT: zip1 z4.b, z16.b, z7.b
-; CHECK-NEXT: mov z16.b, z0.b[10]
-; CHECK-NEXT: zip1 z5.b, z1.b, z17.b
-; CHECK-NEXT: zip1 z7.b, z20.b, z18.b
-; CHECK-NEXT: mov z17.b, z0.b[8]
-; CHECK-NEXT: mov z18.b, z0.b[6]
-; CHECK-NEXT: mov z20.b, z0.b[4]
-; CHECK-NEXT: mov z21.b, z0.b[2]
-; CHECK-NEXT: mov z22.b, z19.b[14]
-; CHECK-NEXT: mov z23.b, z19.b[12]
-; CHECK-NEXT: mov z24.b, z19.b[10]
-; CHECK-NEXT: mov z25.b, z19.b[8]
-; CHECK-NEXT: mov z26.b, z19.b[6]
-; CHECK-NEXT: mov z27.b, z19.b[4]
-; CHECK-NEXT: mov z28.b, z19.b[2]
-; CHECK-NEXT: mov z29.b, z2.b[14]
-; CHECK-NEXT: mov z30.b, z2.b[12]
-; CHECK-NEXT: mov z31.b, z2.b[10]
-; CHECK-NEXT: mov z8.b, z2.b[8]
-; CHECK-NEXT: zip1 z16.b, z17.b, z16.b
-; CHECK-NEXT: zip1 z17.b, z20.b, z18.b
-; CHECK-NEXT: zip1 z18.b, z0.b, z21.b
-; CHECK-NEXT: zip1 z20.b, z23.b, z22.b
-; CHECK-NEXT: zip1 z21.b, z25.b, z24.b
-; CHECK-NEXT: zip1 z22.b, z27.b, z26.b
-; CHECK-NEXT: zip1 z23.b, z19.b, z28.b
-; CHECK-NEXT: zip1 z24.b, z30.b, z29.b
-; CHECK-NEXT: zip1 z25.b, z8.b, z31.b
-; CHECK-NEXT: zip1 z3.h, z6.h, z3.h
-; CHECK-NEXT: zip1 z4.h, z5.h, z4.h
-; CHECK-NEXT: zip1 z5.h, z16.h, z7.h
-; CHECK-NEXT: zip1 z7.h, z18.h, z17.h
-; CHECK-NEXT: zip1 z16.h, z21.h, z20.h
-; CHECK-NEXT: zip1 z17.h, z23.h, z22.h
-; CHECK-NEXT: mov z21.b, z19.b[15]
-; CHECK-NEXT: zip1 z6.h, z25.h, z24.h
-; CHECK-NEXT: mov z22.b, z19.b[13]
-; CHECK-NEXT: mov z23.b, z19.b[11]
-; CHECK-NEXT: mov z24.b, z19.b[9]
-; CHECK-NEXT: mov z26.b, z2.b[6]
-; CHECK-NEXT: mov z27.b, z2.b[4]
-; CHECK-NEXT: mov z20.b, z2.b[2]
-; CHECK-NEXT: mov z25.b, z19.b[7]
-; CHECK-NEXT: mov z28.b, z19.b[1]
-; CHECK-NEXT: zip1 z21.b, z22.b, z21.b
-; CHECK-NEXT: mov z29.b, z2.b[15]
-; CHECK-NEXT: mov z30.b, z2.b[13]
-; CHECK-NEXT: zip1 z22.b, z24.b, z23.b
-; CHECK-NEXT: mov z23.b, z1.b[15]
-; CHECK-NEXT: mov z24.b, z1.b[13]
-; CHECK-NEXT: zip1 z18.b, z27.b, z26.b
-; CHECK-NEXT: mov z26.b, z19.b[5]
-; CHECK-NEXT: mov z27.b, z19.b[3]
-; CHECK-NEXT: mov z31.b, z1.b[9]
-; CHECK-NEXT: zip1 z20.b, z2.b, z20.b
-; CHECK-NEXT: mov z8.b, z1.b[7]
+; CHECK-NEXT: ldp q2, q17, [x1]
+; CHECK-NEXT: mov z20.b, z0.b[10]
+; CHECK-NEXT: zip1 z4.b, z5.b, z4.b
+; CHECK-NEXT: zip1 z5.b, z7.b, z6.b
+; CHECK-NEXT: zip1 z6.b, z1.b, z16.b
+; CHECK-NEXT: mov z7.b, z0.b[8]
+; CHECK-NEXT: mov z16.b, z0.b[6]
+; CHECK-NEXT: mov z21.b, z0.b[4]
+; CHECK-NEXT: mov z22.b, z0.b[2]
+; CHECK-NEXT: mov z23.b, z17.b[14]
+; CHECK-NEXT: mov z24.b, z17.b[12]
+; CHECK-NEXT: mov z25.b, z17.b[10]
+; CHECK-NEXT: mov z26.b, z17.b[8]
+; CHECK-NEXT: mov z27.b, z17.b[6]
+; CHECK-NEXT: mov z28.b, z17.b[4]
+; CHECK-NEXT: mov z29.b, z17.b[2]
+; CHECK-NEXT: zip1 z18.b, z19.b, z18.b
+; CHECK-NEXT: zip1 z7.b, z7.b, z20.b
+; CHECK-NEXT: zip1 z16.b, z21.b, z16.b
+; CHECK-NEXT: zip1 z19.b, z0.b, z22.b
+; CHECK-NEXT: zip1 z20.b, z24.b, z23.b
+; CHECK-NEXT: zip1 z21.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z22.b, z28.b, z27.b
+; CHECK-NEXT: mov z24.b, z2.b[14]
+; CHECK-NEXT: mov z25.b, z2.b[12]
+; CHECK-NEXT: mov z26.b, z2.b[10]
+; CHECK-NEXT: mov z27.b, z2.b[8]
+; CHECK-NEXT: zip1 z23.b, z17.b, z29.b
+; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z5.h, z7.h, z18.h
+; CHECK-NEXT: zip1 z6.h, z19.h, z16.h
+; CHECK-NEXT: zip1 z7.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z18.b, z25.b, z24.b
+; CHECK-NEXT: zip1 z19.b, z27.b, z26.b
+; CHECK-NEXT: mov z20.b, z2.b[6]
+; CHECK-NEXT: mov z21.b, z2.b[4]
+; CHECK-NEXT: mov z29.b, z17.b[3]
+; CHECK-NEXT: mov z30.b, z17.b[1]
+; CHECK-NEXT: mov z31.b, z2.b[15]
+; CHECK-NEXT: mov z8.b, z2.b[13]
+; CHECK-NEXT: zip1 z16.h, z23.h, z22.h
+; CHECK-NEXT: mov z22.b, z2.b[2]
+; CHECK-NEXT: mov z23.b, z17.b[15]
+; CHECK-NEXT: mov z24.b, z17.b[13]
+; CHECK-NEXT: mov z25.b, z17.b[11]
+; CHECK-NEXT: mov z26.b, z17.b[9]
+; CHECK-NEXT: mov z27.b, z17.b[7]
+; CHECK-NEXT: mov z28.b, z17.b[5]
+; CHECK-NEXT: zip1 z17.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z21.b, z21.b, z20.b
+; CHECK-NEXT: zip1 z19.b, z30.b, z29.b
+; CHECK-NEXT: zip1 z20.b, z8.b, z31.b
+; CHECK-NEXT: mov z29.b, z1.b[15]
+; CHECK-NEXT: mov z30.b, z1.b[13]
+; CHECK-NEXT: mov z31.b, z1.b[11]
+; CHECK-NEXT: mov z8.b, z1.b[9]
+; CHECK-NEXT: zip1 z22.b, z2.b, z22.b
; CHECK-NEXT: zip1 z23.b, z24.b, z23.b
-; CHECK-NEXT: mov z24.b, z1.b[11]
-; CHECK-NEXT: mov z9.b, z1.b[5]
-; CHECK-NEXT: zip1 z19.b, z26.b, z25.b
-; CHECK-NEXT: zip1 z25.b, z28.b, z27.b
-; CHECK-NEXT: zip1 z26.b, z30.b, z29.b
-; CHECK-NEXT: mov z27.b, z2.b[11]
-; CHECK-NEXT: mov z28.b, z2.b[9]
-; CHECK-NEXT: mov z29.b, z2.b[7]
-; CHECK-NEXT: mov z30.b, z2.b[5]
-; CHECK-NEXT: mov z10.b, z0.b[11]
-; CHECK-NEXT: mov z11.b, z0.b[9]
-; CHECK-NEXT: mov z12.b, z0.b[3]
-; CHECK-NEXT: mov z13.b, z0.b[1]
+; CHECK-NEXT: zip1 z24.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z18.b, z28.b, z27.b
+; CHECK-NEXT: mov z25.b, z2.b[11]
+; CHECK-NEXT: mov z26.b, z2.b[9]
+; CHECK-NEXT: mov z27.b, z2.b[7]
+; CHECK-NEXT: mov z28.b, z2.b[5]
+; CHECK-NEXT: mov z9.b, z1.b[7]
+; CHECK-NEXT: mov z10.b, z1.b[5]
; CHECK-NEXT: mov z1.b, z1.b[3]
-; CHECK-NEXT: mov z14.b, z0.b[13]
-; CHECK-NEXT: mov z0.b, z0.b[5]
-; CHECK-NEXT: zip1 z24.b, z31.b, z24.b
-; CHECK-NEXT: mov z31.b, z2.b[3]
+; CHECK-NEXT: mov z11.b, z0.b[11]
+; CHECK-NEXT: mov z12.b, z0.b[9]
+; CHECK-NEXT: zip1 z29.b, z30.b, z29.b
+; CHECK-NEXT: mov z30.b, z0.b[3]
+; CHECK-NEXT: mov z13.b, z0.b[1]
+; CHECK-NEXT: zip1 z31.b, z8.b, z31.b
+; CHECK-NEXT: mov z8.b, z2.b[3]
; CHECK-NEXT: mov z2.b, z2.b[1]
-; CHECK-NEXT: zip1 z8.b, z9.b, z8.b
-; CHECK-NEXT: zip1 z9.b, z11.b, z10.b
-; CHECK-NEXT: zip1 z10.b, z13.b, z12.b
-; CHECK-NEXT: zip1 z27.b, z28.b, z27.b
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z9.b, z10.b, z9.b
+; CHECK-NEXT: zip1 z10.b, z12.b, z11.b
; CHECK-NEXT: zip1 z1.b, z0.b, z1.b
-; CHECK-NEXT: zip1 z11.b, z14.b, z0.b
-; CHECK-NEXT: zip1 z0.b, z0.b, z0.b
-; CHECK-NEXT: zip1 z28.b, z30.b, z29.b
-; CHECK-NEXT: zip1 z2.b, z2.b, z31.b
-; CHECK-NEXT: zip1 z18.h, z20.h, z18.h
-; CHECK-NEXT: zip1 z20.h, z22.h, z21.h
-; CHECK-NEXT: zip1 z21.h, z24.h, z23.h
-; CHECK-NEXT: zip1 z1.h, z1.h, z8.h
-; CHECK-NEXT: zip1 z19.h, z25.h, z19.h
-; CHECK-NEXT: zip1 z22.h, z9.h, z11.h
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: zip1 z0.h, z10.h, z0.h
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: zip1 z23.h, z27.h, z26.h
-; CHECK-NEXT: zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT: zip1 z30.b, z13.b, z30.b
+; CHECK-NEXT: mov z11.b, z0.b[13]
+; CHECK-NEXT: mov z0.b, z0.b[5]
+; CHECK-NEXT: zip1 z25.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z26.b, z28.b, z27.b
+; CHECK-NEXT: zip1 z2.b, z2.b, z8.b
+; CHECK-NEXT: zip1 z21.h, z22.h, z21.h
+; CHECK-NEXT: zip1 z22.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z23.h, z31.h, z29.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z9.h
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z24.h, z10.h, z11.h
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z0.h, z30.h, z0.h
+; CHECK-NEXT: zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z19.h, z25.h, z20.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z26.h
; CHECK-NEXT: zip1 z3.s, z4.s, z3.s
-; CHECK-NEXT: zip1 z4.s, z7.s, z5.s
-; CHECK-NEXT: zip1 z5.s, z17.s, z16.s
-; CHECK-NEXT: zip1 z1.s, z1.s, z21.s
-; CHECK-NEXT: zip1 z0.s, z0.s, z22.s
-; CHECK-NEXT: zip1 z6.s, z18.s, z6.s
-; CHECK-NEXT: zip1 z7.s, z19.s, z20.s
-; CHECK-NEXT: zip1 z2.s, z2.s, z23.s
+; CHECK-NEXT: zip1 z4.s, z6.s, z5.s
+; CHECK-NEXT: zip1 z5.s, z16.s, z7.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z23.s
+; CHECK-NEXT: zip1 z6.s, z21.s, z17.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z24.s
+; CHECK-NEXT: zip1 z7.s, z18.s, z22.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z19.s
; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: zip1 z1.d, z6.d, z5.d
@@ -1569,7 +1564,7 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
; CHECK-NEXT: add z0.b, z3.b, z0.b
; CHECK-NEXT: add z1.b, z1.b, z2.b
; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp], #48 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v32i8:
@@ -1909,29 +1904,26 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
define void @uzp_v8f32(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q6, q0, [x0]
; CHECK-NEXT: adrp x8, .LCPI21_0
-; CHECK-NEXT: ldp q2, q3, [x1]
+; CHECK-NEXT: ldp q1, q2, [x1]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: mov z4.s, z0.s[2]
-; CHECK-NEXT: mov z5.s, z0.s[3]
-; CHECK-NEXT: mov z6.s, z0.s[1]
-; CHECK-NEXT: mov z7.s, z1.s[1]
-; CHECK-NEXT: mov z16.s, z3.s[2]
-; CHECK-NEXT: mov z17.s, z2.s[2]
-; CHECK-NEXT: zip1 z0.s, z0.s, z4.s
-; CHECK-NEXT: zip1 z4.s, z6.s, z5.s
-; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT: zip1 z3.s, z3.s, z16.s
-; CHECK-NEXT: tbl z2.s, { z2.s }, z6.s
-; CHECK-NEXT: zip1 z1.s, z1.s, z0.s
-; CHECK-NEXT: zip1 z5.s, z7.s, z0.s
-; CHECK-NEXT: zip1 z7.s, z0.s, z17.s
-; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT: zip1 z1.d, z5.d, z4.d
-; CHECK-NEXT: zip1 z3.d, z7.d, z3.d
-; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: mov z3.s, z0.s[2]
+; CHECK-NEXT: mov z4.s, z0.s[3]
+; CHECK-NEXT: mov z5.s, z0.s[1]
+; CHECK-NEXT: mov z7.s, z2.s[2]
+; CHECK-NEXT: mov z16.s, z1.s[2]
+; CHECK-NEXT: zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: mov z4.s, z6.s[1]
+; CHECK-NEXT: zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT: zip1 z7.s, z0.s, z16.s
+; CHECK-NEXT: tbl z1.s, { z1.s }, z5.s
+; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT: zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s
; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
>From c7a6b91b705d688f772fff7dac4ff3153576d4d1 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 10 Oct 2024 13:00:32 +0000
Subject: [PATCH 3/3] Fixups and limit int types
---
.../Target/AArch64/AArch64ISelLowering.cpp | 45 +++---
...treaming-mode-fixed-length-build-vector.ll | 29 ++--
...streaming-mode-fixed-length-masked-load.ll | 134 +++++++----------
...treaming-mode-fixed-length-masked-store.ll | 140 +++++++-----------
4 files changed, 149 insertions(+), 199 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 71115705407bd6..bb2a7587849c59 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14399,39 +14399,44 @@ SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
return convertFromScalableVector(DAG, VT, Seq);
}
+ unsigned NumElems = VT.getVectorNumElements();
if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
- VT.getVectorNumElements() <= 1 || BVN->isConstant())
+ NumElems <= 1 || BVN->isConstant())
+ return SDValue();
+
+ auto IsExtractElt = [](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+ };
+
+ // For integer types, allow at most four elements that are not already in
+ // vectors. This is an arbitrary restriction to avoid many fmovs from GPRs.
+ if (VT.getScalarType().isInteger() &&
+ NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
return SDValue();
// Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
- EVT ZipVT = ContainerVT;
SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
- SmallVector<SDValue, 16> Intermediates =
- llvm::map_to_vector<16>(Op->op_values(), [&](SDValue Op) {
- SDValue Undef = DAG.getUNDEF(ZipVT);
+ SmallVector<SDValue, 16> Intermediates = llvm::map_to_vector<16>(
+ Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
return Op.isUndef() ? Undef
- : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ZipVT,
- Undef, Op, ZeroI64);
+ : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+ ContainerVT, Undef, Op, ZeroI64);
});
+ ElementCount ZipEC = ContainerVT.getVectorElementCount();
while (Intermediates.size() > 1) {
- auto ToZipVT = [&](SDValue Op) { return DAG.getBitcast(ZipVT, Op); };
+ EVT ZipVT = getPackedSVEVectorVT(ZipEC);
+
for (unsigned I = 0; I < Intermediates.size(); I += 2) {
- SDValue Op0 = Intermediates[I + 0];
- SDValue Op1 = Intermediates[I + 1];
- Intermediates[I / 2] = Op1.isUndef()
- ? Op0
- : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT,
- ToZipVT(Op0), ToZipVT(Op1));
+ SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
+ SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
+ Intermediates[I / 2] =
+ Op1.isUndef() ? Op0
+ : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
}
Intermediates.resize(Intermediates.size() / 2);
- if (Intermediates.size() > 1) {
- // Prefer FP values to keep elements within vector registers (and also as
- // f16 is conveniently a legal type).
- ZipVT = getPackedSVEVectorVT(EVT::getFloatingPointVT(
- ZipVT.getVectorElementType().getSizeInBits() * 2));
- }
+ ZipEC = ZipEC.divideCoefficientBy(2);
}
assert(Intermediates.size() == 1);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index 9166dcbf62c4ef..9729a1d95cd916 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -430,23 +430,20 @@ define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) {
define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) {
; CHECK-LABEL: build_vector_non_const_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w7
-; CHECK-NEXT: fmov s1, w6
-; CHECK-NEXT: ldr x8, [sp]
-; CHECK-NEXT: fmov s2, w4
-; CHECK-NEXT: fmov s3, w3
-; CHECK-NEXT: fmov s4, w2
-; CHECK-NEXT: fmov s5, w1
-; CHECK-NEXT: fmov s6, w0
-; CHECK-NEXT: zip1 z0.b, z1.b, z0.b
-; CHECK-NEXT: fmov s1, w5
-; CHECK-NEXT: zip1 z1.b, z2.b, z1.b
-; CHECK-NEXT: zip1 z2.b, z4.b, z3.b
-; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
-; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
-; CHECK-NEXT: zip1 z1.h, z3.h, z2.h
-; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: strb w7, [sp, #15]
+; CHECK-NEXT: ldr x8, [sp, #16]
+; CHECK-NEXT: strb w6, [sp, #14]
+; CHECK-NEXT: strb w5, [sp, #13]
+; CHECK-NEXT: strb w4, [sp, #12]
+; CHECK-NEXT: strb w3, [sp, #11]
+; CHECK-NEXT: strb w2, [sp, #10]
+; CHECK-NEXT: strb w1, [sp, #9]
+; CHECK-NEXT: strb w0, [sp, #8]
+; CHECK-NEXT: ldr d0, [sp, #8]
; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 4980ee4d7f74b7..9055b2efba3282 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -676,105 +676,79 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
; CHECK-LABEL: masked_load_v32i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: ldr w8, [sp, #224]
+; CHECK-NEXT: ldr w9, [sp, #216]
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: strb w7, [sp, #6]
+; CHECK-NEXT: strb w8, [sp, #31]
+; CHECK-NEXT: ldr w8, [sp, #208]
+; CHECK-NEXT: strb w9, [sp, #30]
+; CHECK-NEXT: ldr w9, [sp, #200]
+; CHECK-NEXT: strb w8, [sp, #29]
; CHECK-NEXT: ldr w8, [sp, #192]
+; CHECK-NEXT: strb w9, [sp, #28]
; CHECK-NEXT: ldr w9, [sp, #184]
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: ldr w10, [sp, #160]
-; CHECK-NEXT: ldr w11, [sp, #144]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: strb w8, [sp, #27]
; CHECK-NEXT: ldr w8, [sp, #176]
-; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: strb w9, [sp, #26]
; CHECK-NEXT: ldr w9, [sp, #168]
-; CHECK-NEXT: fmov s3, w10
-; CHECK-NEXT: fmov s4, w11
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: ldr w8, [sp, #152]
-; CHECK-NEXT: ldr w10, [sp, #112]
-; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: strb w8, [sp, #25]
+; CHECK-NEXT: ldr w8, [sp, #160]
+; CHECK-NEXT: strb w9, [sp, #24]
+; CHECK-NEXT: ldr w9, [sp, #152]
+; CHECK-NEXT: strb w8, [sp, #23]
+; CHECK-NEXT: ldr w8, [sp, #144]
+; CHECK-NEXT: strb w9, [sp, #22]
; CHECK-NEXT: ldr w9, [sp, #136]
-; CHECK-NEXT: ldr w11, [sp, #96]
-; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: strb w8, [sp, #21]
; CHECK-NEXT: ldr w8, [sp, #128]
-; CHECK-NEXT: zip1 z0.b, z1.b, z0.b
-; CHECK-NEXT: fmov s7, w9
+; CHECK-NEXT: strb w9, [sp, #20]
; CHECK-NEXT: ldr w9, [sp, #120]
-; CHECK-NEXT: fmov s18, w10
-; CHECK-NEXT: fmov s16, w8
-; CHECK-NEXT: ldr w8, [sp, #104]
-; CHECK-NEXT: zip1 z2.b, z6.b, z2.b
-; CHECK-NEXT: fmov s17, w9
+; CHECK-NEXT: strb w8, [sp, #19]
+; CHECK-NEXT: ldr w8, [sp, #112]
+; CHECK-NEXT: strb w9, [sp, #18]
+; CHECK-NEXT: ldr w9, [sp, #104]
+; CHECK-NEXT: strb w8, [sp, #17]
+; CHECK-NEXT: ldr w8, [sp, #96]
+; CHECK-NEXT: strb w9, [sp, #16]
; CHECK-NEXT: ldr w9, [sp, #88]
-; CHECK-NEXT: fmov s20, w11
-; CHECK-NEXT: fmov s19, w8
+; CHECK-NEXT: strb w8, [sp, #15]
; CHECK-NEXT: ldr w8, [sp, #80]
-; CHECK-NEXT: ldr w10, [sp, #64]
-; CHECK-NEXT: fmov s21, w9
+; CHECK-NEXT: strb w9, [sp, #14]
; CHECK-NEXT: ldr w9, [sp, #72]
-; CHECK-NEXT: ldr w11, [sp, #48]
-; CHECK-NEXT: fmov s22, w8
-; CHECK-NEXT: ldr w8, [sp, #56]
-; CHECK-NEXT: zip1 z3.b, z5.b, z3.b
-; CHECK-NEXT: fmov s23, w9
+; CHECK-NEXT: strb w8, [sp, #13]
+; CHECK-NEXT: ldr w8, [sp, #64]
+; CHECK-NEXT: strb w9, [sp, #12]
+; CHECK-NEXT: ldr w9, [sp, #56]
+; CHECK-NEXT: strb w8, [sp, #11]
+; CHECK-NEXT: ldr w8, [sp, #48]
+; CHECK-NEXT: strb w9, [sp, #10]
; CHECK-NEXT: ldr w9, [sp, #40]
-; CHECK-NEXT: zip1 z4.b, z7.b, z4.b
-; CHECK-NEXT: fmov s25, w8
+; CHECK-NEXT: strb w8, [sp, #9]
; CHECK-NEXT: ldr w8, [sp, #32]
-; CHECK-NEXT: fmov s24, w10
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: ldr w9, [sp, #24]
-; CHECK-NEXT: fmov s26, w11
-; CHECK-NEXT: fmov s6, w8
-; CHECK-NEXT: ldr w8, [sp, #16]
-; CHECK-NEXT: zip1 z16.b, z17.b, z16.b
-; CHECK-NEXT: fmov s5, w9
-; CHECK-NEXT: ldr w9, [sp, #8]
-; CHECK-NEXT: zip1 z17.b, z19.b, z18.b
-; CHECK-NEXT: fmov s7, w8
-; CHECK-NEXT: ldr w8, [sp]
-; CHECK-NEXT: zip1 z19.b, z21.b, z20.b
-; CHECK-NEXT: fmov s18, w9
-; CHECK-NEXT: zip1 z20.b, z23.b, z22.b
-; CHECK-NEXT: fmov s23, w7
-; CHECK-NEXT: fmov s22, w8
-; CHECK-NEXT: zip1 z21.b, z25.b, z24.b
-; CHECK-NEXT: zip1 z1.b, z1.b, z26.b
-; CHECK-NEXT: zip1 z5.b, z5.b, z6.b
-; CHECK-NEXT: fmov s24, w3
-; CHECK-NEXT: fmov s25, w2
-; CHECK-NEXT: zip1 z6.b, z18.b, z7.b
-; CHECK-NEXT: fmov s18, w6
-; CHECK-NEXT: fmov s26, w1
-; CHECK-NEXT: zip1 z7.b, z23.b, z22.b
-; CHECK-NEXT: fmov s22, w5
-; CHECK-NEXT: fmov s23, w4
-; CHECK-NEXT: zip1 z0.h, z2.h, z0.h
-; CHECK-NEXT: zip1 z2.h, z4.h, z3.h
-; CHECK-NEXT: zip1 z3.h, z17.h, z16.h
-; CHECK-NEXT: zip1 z4.h, z20.h, z19.h
-; CHECK-NEXT: zip1 z1.h, z1.h, z21.h
-; CHECK-NEXT: zip1 z5.h, z6.h, z5.h
-; CHECK-NEXT: zip1 z18.b, z22.b, z18.b
-; CHECK-NEXT: zip1 z22.b, z24.b, z23.b
+; CHECK-NEXT: strb w9, [sp, #8]
+; CHECK-NEXT: strb w8, [sp, #7]
; CHECK-NEXT: mov w8, #16 // =0x10
-; CHECK-NEXT: zip1 z23.b, z26.b, z25.b
-; CHECK-NEXT: zip1 z0.s, z2.s, z0.s
-; CHECK-NEXT: zip1 z2.s, z4.s, z3.s
-; CHECK-NEXT: zip1 z1.s, z5.s, z1.s
-; CHECK-NEXT: zip1 z6.h, z18.h, z7.h
-; CHECK-NEXT: zip1 z7.h, z23.h, z22.h
-; CHECK-NEXT: zip1 z0.d, z2.d, z0.d
-; CHECK-NEXT: zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT: strb w6, [sp, #5]
+; CHECK-NEXT: strb w5, [sp, #4]
+; CHECK-NEXT: strb w4, [sp, #3]
+; CHECK-NEXT: strb w3, [sp, #2]
+; CHECK-NEXT: strb w2, [sp, #1]
+; CHECK-NEXT: strb w1, [sp]
+; CHECK-NEXT: ldp q1, q0, [sp]
; CHECK-NEXT: lsl z0.b, z0.b, #7
-; CHECK-NEXT: zip1 z1.d, z3.d, z1.d
-; CHECK-NEXT: asr z0.b, z0.b, #7
; CHECK-NEXT: lsl z1.b, z1.b, #7
-; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT: asr z0.b, z0.b, #7
; CHECK-NEXT: asr z1.b, z1.b, #7
+; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x8]
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x8]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
+; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_v32i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 2966ab12b8cad6..265480b571970f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -293,104 +293,78 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
; CHECK-LABEL: masked_store_v32i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: ldr w8, [sp, #96]
+; CHECK-NEXT: ldr w9, [sp, #88]
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ldr w10, [sp, #120]
+; CHECK-NEXT: strb w7, [sp, #6]
+; CHECK-NEXT: strb w8, [sp, #15]
+; CHECK-NEXT: ldr w8, [sp, #80]
+; CHECK-NEXT: strb w9, [sp, #14]
+; CHECK-NEXT: ldr w9, [sp, #72]
+; CHECK-NEXT: strb w8, [sp, #13]
; CHECK-NEXT: ldr w8, [sp, #64]
+; CHECK-NEXT: strb w9, [sp, #12]
; CHECK-NEXT: ldr w9, [sp, #56]
-; CHECK-NEXT: fmov s26, w2
-; CHECK-NEXT: ldr w10, [sp, #32]
-; CHECK-NEXT: ldr w11, [sp, #16]
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: strb w8, [sp, #11]
; CHECK-NEXT: ldr w8, [sp, #48]
-; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: strb w9, [sp, #10]
; CHECK-NEXT: ldr w9, [sp, #40]
-; CHECK-NEXT: fmov s5, w10
-; CHECK-NEXT: fmov s7, w11
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: ldr w8, [sp, #24]
-; CHECK-NEXT: ldr w10, [sp, #176]
-; CHECK-NEXT: fmov s3, w9
-; CHECK-NEXT: ldr w9, [sp, #8]
-; CHECK-NEXT: ldr w11, [sp, #168]
-; CHECK-NEXT: fmov s6, w8
-; CHECK-NEXT: ldr w8, [sp]
-; CHECK-NEXT: fmov s19, w10
-; CHECK-NEXT: fmov s16, w9
+; CHECK-NEXT: strb w8, [sp, #9]
+; CHECK-NEXT: ldr w8, [sp, #32]
+; CHECK-NEXT: strb w9, [sp, #8]
+; CHECK-NEXT: ldr w9, [sp, #216]
+; CHECK-NEXT: strb w8, [sp, #7]
+; CHECK-NEXT: ldr w8, [sp, #224]
+; CHECK-NEXT: strb w9, [sp, #30]
+; CHECK-NEXT: ldr w9, [sp, #200]
+; CHECK-NEXT: strb w8, [sp, #31]
+; CHECK-NEXT: ldr w8, [sp, #208]
+; CHECK-NEXT: strb w9, [sp, #28]
; CHECK-NEXT: ldr w9, [sp, #184]
-; CHECK-NEXT: fmov s20, w11
-; CHECK-NEXT: zip1 z4.b, z3.b, z2.b
-; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: strb w8, [sp, #29]
; CHECK-NEXT: ldr w8, [sp, #192]
-; CHECK-NEXT: fmov s18, w9
+; CHECK-NEXT: strb w9, [sp, #26]
+; CHECK-NEXT: ldr w9, [sp, #168]
+; CHECK-NEXT: strb w8, [sp, #27]
+; CHECK-NEXT: ldr w8, [sp, #176]
+; CHECK-NEXT: strb w9, [sp, #24]
; CHECK-NEXT: ldr w9, [sp, #152]
-; CHECK-NEXT: ldr w10, [sp, #136]
-; CHECK-NEXT: fmov s17, w8
+; CHECK-NEXT: strb w8, [sp, #25]
; CHECK-NEXT: ldr w8, [sp, #160]
-; CHECK-NEXT: ldr w11, [sp, #120]
-; CHECK-NEXT: fmov s21, w10
-; CHECK-NEXT: ldr w10, [sp, #88]
-; CHECK-NEXT: zip1 z1.b, z1.b, z0.b
-; CHECK-NEXT: fmov s23, w11
-; CHECK-NEXT: ldr w11, [sp, #72]
-; CHECK-NEXT: zip1 z0.b, z6.b, z5.b
-; CHECK-NEXT: zip1 z17.b, z18.b, z17.b
-; CHECK-NEXT: zip1 z18.b, z20.b, z19.b
-; CHECK-NEXT: fmov s19, w8
-; CHECK-NEXT: fmov s20, w9
+; CHECK-NEXT: strb w9, [sp, #22]
+; CHECK-NEXT: ldr w9, [sp, #136]
+; CHECK-NEXT: strb w8, [sp, #23]
; CHECK-NEXT: ldr w8, [sp, #144]
-; CHECK-NEXT: ldr w9, [sp, #128]
-; CHECK-NEXT: fmov s24, w10
-; CHECK-NEXT: fmov s5, w7
-; CHECK-NEXT: fmov s25, w11
-; CHECK-NEXT: fmov s22, w9
-; CHECK-NEXT: ldr w9, [sp, #104]
-; CHECK-NEXT: zip1 z2.b, z16.b, z7.b
-; CHECK-NEXT: zip1 z19.b, z20.b, z19.b
-; CHECK-NEXT: fmov s20, w8
-; CHECK-NEXT: ldr w8, [sp, #112]
-; CHECK-NEXT: zip1 z3.b, z5.b, z3.b
-; CHECK-NEXT: fmov s5, w6
-; CHECK-NEXT: fmov s6, w5
-; CHECK-NEXT: fmov s7, w4
-; CHECK-NEXT: fmov s16, w3
-; CHECK-NEXT: zip1 z1.h, z4.h, z1.h
-; CHECK-NEXT: zip1 z20.b, z21.b, z20.b
-; CHECK-NEXT: zip1 z21.b, z23.b, z22.b
-; CHECK-NEXT: fmov s22, w8
-; CHECK-NEXT: fmov s23, w9
-; CHECK-NEXT: ldr w8, [sp, #96]
-; CHECK-NEXT: ldr w9, [sp, #80]
-; CHECK-NEXT: zip1 z5.b, z6.b, z5.b
-; CHECK-NEXT: zip1 z6.b, z16.b, z7.b
-; CHECK-NEXT: zip1 z4.h, z18.h, z17.h
-; CHECK-NEXT: zip1 z16.h, z20.h, z19.h
-; CHECK-NEXT: zip1 z0.h, z2.h, z0.h
-; CHECK-NEXT: zip1 z22.b, z23.b, z22.b
-; CHECK-NEXT: fmov s23, w8
+; CHECK-NEXT: strb w9, [sp, #20]
+; CHECK-NEXT: ldr w9, [sp, #112]
+; CHECK-NEXT: strb w8, [sp, #21]
+; CHECK-NEXT: ldr w8, [sp, #128]
+; CHECK-NEXT: strb w6, [sp, #5]
+; CHECK-NEXT: strb w8, [sp, #19]
+; CHECK-NEXT: ldr w8, [sp, #104]
+; CHECK-NEXT: strb w5, [sp, #4]
+; CHECK-NEXT: strb w4, [sp, #3]
+; CHECK-NEXT: strb w3, [sp, #2]
+; CHECK-NEXT: strb w2, [sp, #1]
+; CHECK-NEXT: strb w1, [sp]
+; CHECK-NEXT: strb w10, [sp, #18]
+; CHECK-NEXT: strb w9, [sp, #17]
+; CHECK-NEXT: strb w8, [sp, #16]
; CHECK-NEXT: mov w8, #16 // =0x10
-; CHECK-NEXT: zip1 z2.h, z5.h, z3.h
-; CHECK-NEXT: zip1 z4.s, z16.s, z4.s
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: zip1 z23.b, z24.b, z23.b
-; CHECK-NEXT: fmov s24, w9
-; CHECK-NEXT: zip1 z17.h, z22.h, z21.h
-; CHECK-NEXT: zip1 z24.b, z25.b, z24.b
-; CHECK-NEXT: fmov s25, w1
-; CHECK-NEXT: zip1 z7.b, z25.b, z26.b
-; CHECK-NEXT: zip1 z18.h, z24.h, z23.h
-; CHECK-NEXT: zip1 z3.h, z7.h, z6.h
-; CHECK-NEXT: zip1 z5.s, z18.s, z17.s
-; CHECK-NEXT: zip1 z1.s, z3.s, z2.s
-; CHECK-NEXT: zip1 z2.d, z5.d, z4.d
-; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT: lsl z1.b, z2.b, #7
+; CHECK-NEXT: ldp q1, q0, [sp]
; CHECK-NEXT: lsl z0.b, z0.b, #7
-; CHECK-NEXT: asr z1.b, z1.b, #7
+; CHECK-NEXT: lsl z1.b, z1.b, #7
; CHECK-NEXT: asr z0.b, z0.b, #7
-; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: asr z1.b, z1.b, #7
+; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: mov z0.b, #0 // =0x0
; CHECK-NEXT: st1b { z0.b }, p1, [x0, x8]
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_store_v32i8:
More information about the llvm-commits
mailing list