[llvm] 5f7502b - [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (#111698)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 18 02:19:26 PDT 2024
Author: Benjamin Maxwell
Date: 2024-10-18T10:19:22+01:00
New Revision: 5f7502bf1f193482e23385cdd4cfecf09f19ccbc
URL: https://github.com/llvm/llvm-project/commit/5f7502bf1f193482e23385cdd4cfecf09f19ccbc
DIFF: https://github.com/llvm/llvm-project/commit/5f7502bf1f193482e23385cdd4cfecf09f19ccbc.diff
LOG: [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (#111698)
This allows lowering fixed-length (non-constant) BUILD_VECTORS (<=
128-bit) to a chain of ZIP1 instructions when Neon is not available,
rather than using the default lowering, which is to spill to the stack
and reload.
For example,
```
t5: v4f32 = BUILD_VECTOR(t0, t1, t2, t3)
```
Becomes:
```
zip1 z0.s, z0.s, z1.s // z0 = t0,t1,...
zip1 z2.s, z2.s, z3.s // z2 = t2,t3,...
zip1 z0.d, z0.d, z2.d // z0 = t0,t1,t2,t3,...
```
When values are already in FPRs, this generally seems to lead to more
compact output with less movement to/from the stack.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a44a73eb2c0fda..d5466e0a1cbd44 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -2111,7 +2112,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
setOperationAction(ISD::BITREVERSE, VT, Default);
setOperationAction(ISD::BSWAP, VT, Default);
- setOperationAction(ISD::BUILD_VECTOR, VT, Default);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
setOperationAction(ISD::CTLZ, VT, Default);
setOperationAction(ISD::CTPOP, VT, Default);
@@ -14395,24 +14396,72 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ auto *BVN = cast<BuildVectorSDNode>(Op);
- if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
- if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
- SDLoc DL(Op);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
- SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
- SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
- return convertFromScalableVector(DAG, Op.getValueType(), Seq);
- }
+ if (auto SeqInfo = BVN->isConstantSequence()) {
+ SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
+ SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
+ SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
+ return convertFromScalableVector(DAG, VT, Seq);
+ }
+
+ unsigned NumElems = VT.getVectorNumElements();
+ if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
+ NumElems <= 1 || BVN->isConstant())
+ return SDValue();
+
+ auto IsExtractElt = [](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+ };
- // Revert to common legalisation for all other variants.
+ // For integer types that are not already in vectors limit to at most four
+ // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
+ if (VT.getScalarType().isInteger() &&
+ NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
return SDValue();
+
+ // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
+ SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
+ SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
+ Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
+ return Op.isUndef() ? Undef
+ : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+ ContainerVT, Undef, Op, ZeroI64);
+ });
+
+ ElementCount ZipEC = ContainerVT.getVectorElementCount();
+ while (Intermediates.size() > 1) {
+ EVT ZipVT = getPackedSVEVectorVT(ZipEC);
+
+ for (unsigned I = 0; I < Intermediates.size(); I += 2) {
+ SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
+ SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
+ Intermediates[I / 2] =
+ Op1.isUndef() ? Op0
+ : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+ }
+
+ Intermediates.resize(Intermediates.size() / 2);
+ ZipEC = ZipEC.divideCoefficientBy(2);
}
+ assert(Intermediates.size() == 1);
+ SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
+ return convertFromScalableVector(DAG, VT, Vec);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+ return LowerFixedLengthBuildVectorToSVE(Op, DAG);
+
// Try to build a simple constant vector.
Op = NormalizeBuildVector(Op, DAG);
// Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 217e971568a999..160cd18ca53b32 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1244,6 +1244,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 276f23703df3df..20659cde83ee00 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,98 +140,65 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_128_NOMAX-NEXT: fmov w8, s3
; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_128_NOMAX-NEXT: fmov w8, s3
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_128_NOMAX-NEXT: fmov w9, s0
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3
; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s0
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_NOMIN_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3
; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s0
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_MIN_256_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
@@ -263,89 +230,59 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_128_NOMAX-NEXT: fmov w8, s0
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_128_NOMAX-NEXT: fmov w9, s1
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s0
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s1
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s0
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s1
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
@@ -401,34 +338,23 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: mov z1.b, z0.b[7]
-; CHECK-NEXT: mov z2.b, z0.b[6]
-; CHECK-NEXT: mov z3.b, z0.b[4]
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: fmov w9, s2
; CHECK-NEXT: mov z2.b, z0.b[3]
-; CHECK-NEXT: mov z1.b, z1.b[1]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: mov z3.b, z0.b[2]
-; CHECK-NEXT: strb w9, [sp, #14]
-; CHECK-NEXT: mov z0.b, z0.b[1]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strb w9, [sp, #11]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w9, [sp, #9]
-; CHECK-NEXT: strb w8, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z4.b, z0.b[1]
+; CHECK-NEXT: mov z1.b, z1.b[1]
+; CHECK-NEXT: mov z5.b, z0.b[7]
+; CHECK-NEXT: mov z6.b, z0.b[6]
+; CHECK-NEXT: mov z0.b, z0.b[4]
+; CHECK-NEXT: zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT: zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z0.b, z0.b, z0.b
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index 617b560713c3ab..478072d33d8c9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -184,13 +184,11 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
; CHECK-LABEL: vls_sve_and_2xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fmov s1, wzr
; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: stp wzr, w8, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index b9264ad5f77c37..6644be11a02ba7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -91,19 +91,12 @@ define void @bitcast_v32i8(ptr %a, ptr %b) {
define void @bitcast_v2i16(ptr %a, ptr %b) {
; CHECK-LABEL: bitcast_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: str w8, [x1]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: bitcast_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index b8a2e0e0f4bd4c..9729a1d95cd916 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -222,3 +222,255 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
ret void
}
+
+define void @build_vector_non_const_v4i1(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr w8, w0, w1, lsl #1
+; CHECK-NEXT: orr w8, w8, w2, lsl #2
+; CHECK-NEXT: orr w8, w8, w3, lsl #3
+; CHECK-NEXT: strb w8, [x4]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4i1:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: orr w8, w0, w1, lsl #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w2, lsl #2
+; NONEON-NOSVE-NEXT: orr w8, w8, w3, lsl #3
+; NONEON-NOSVE-NEXT: strb w8, [x4]
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+ %2 = insertelement <4 x i1> %1, i1 %b, i64 1
+ %3 = insertelement <4 x i1> %2, i1 %c, i64 2
+ %4 = insertelement <4 x i1> %3, i1 %d, i64 3
+ store <4 x i1> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2f64(double %a, double %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x double> undef, double %a, i64 0
+ %2 = insertelement <2 x double> %1, double %b, i64 1
+ store <2 x double> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2f32(float %a, float %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x float> undef, float %a, i64 0
+ %2 = insertelement <2 x float> %1, float %b, i64 1
+ store <2 x float> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v4f32(float %a, float %b, float %c, float %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: // kill: def $s3 killed $s3 def $z3
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z2.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp s2, s3, [sp, #8]
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp]
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x float> undef, float %a, i64 0
+ %2 = insertelement <4 x float> %1, float %b, i64 1
+ %3 = insertelement <4 x float> %2, float %c, i64 2
+ %4 = insertelement <4 x float> %3, float %d, i64 3
+ store <4 x float> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v4f64(double %a, double %b, double %c, double %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $z3
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: stp d2, d3, [sp, #16]
+; NONEON-NOSVE-NEXT: ldp q1, q0, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x double> undef, double %a, i64 0
+ %2 = insertelement <4 x double> %1, double %b, i64 1
+ %3 = insertelement <4 x double> %2, double %c, i64 2
+ %4 = insertelement <4 x double> %3, double %d, i64 3
+ store <4 x double> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v8f16(half %a, half %b, half %c, half %d, half %e, half %f, half %g, half %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h6 killed $h6 def $z6
+; CHECK-NEXT: // kill: def $h4 killed $h4 def $z4
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: // kill: def $h7 killed $h7 def $z7
+; CHECK-NEXT: // kill: def $h5 killed $h5 def $z5
+; CHECK-NEXT: // kill: def $h3 killed $h3 def $z3
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $z1
+; CHECK-NEXT: zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z4.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: zip1 z1.s, z4.s, z6.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: str h7, [sp, #14]
+; NONEON-NOSVE-NEXT: str h6, [sp, #12]
+; NONEON-NOSVE-NEXT: str h5, [sp, #10]
+; NONEON-NOSVE-NEXT: str h4, [sp, #8]
+; NONEON-NOSVE-NEXT: str h3, [sp, #6]
+; NONEON-NOSVE-NEXT: str h2, [sp, #4]
+; NONEON-NOSVE-NEXT: str h1, [sp, #2]
+; NONEON-NOSVE-NEXT: str h0, [sp]
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <8 x half> undef, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %b, i64 1
+ %3 = insertelement <8 x half> %2, half %c, i64 2
+ %4 = insertelement <8 x half> %3, half %d, i64 3
+ %5 = insertelement <8 x half> %4, half %e, i64 4
+ %6 = insertelement <8 x half> %5, half %f, i64 5
+ %7 = insertelement <8 x half> %6, half %g, i64 6
+ %8 = insertelement <8 x half> %7, half %h, i64 7
+ store <8 x half> %8, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: str d0, [x2]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2i32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp w0, w1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x2]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x i32> undef, i32 %a, i64 0
+ %2 = insertelement <2 x i32> %1, i32 %b, i64 1
+ store <2 x i32> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: strb w7, [sp, #15]
+; CHECK-NEXT: ldr x8, [sp, #16]
+; CHECK-NEXT: strb w6, [sp, #14]
+; CHECK-NEXT: strb w5, [sp, #13]
+; CHECK-NEXT: strb w4, [sp, #12]
+; CHECK-NEXT: strb w3, [sp, #11]
+; CHECK-NEXT: strb w2, [sp, #10]
+; CHECK-NEXT: strb w1, [sp, #9]
+; CHECK-NEXT: strb w0, [sp, #8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: strb w7, [sp, #15]
+; NONEON-NOSVE-NEXT: ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT: strb w6, [sp, #14]
+; NONEON-NOSVE-NEXT: strb w5, [sp, #13]
+; NONEON-NOSVE-NEXT: strb w4, [sp, #12]
+; NONEON-NOSVE-NEXT: strb w3, [sp, #11]
+; NONEON-NOSVE-NEXT: strb w2, [sp, #10]
+; NONEON-NOSVE-NEXT: strb w1, [sp, #9]
+; NONEON-NOSVE-NEXT: strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x8]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <8 x i8> undef, i8 %a, i64 0
+ %2 = insertelement <8 x i8> %1, i8 %b, i64 1
+ %3 = insertelement <8 x i8> %2, i8 %c, i64 2
+ %4 = insertelement <8 x i8> %3, i8 %d, i64 3
+ %5 = insertelement <8 x i8> %4, i8 %e, i64 4
+ %6 = insertelement <8 x i8> %5, i8 %f, i64 5
+ %7 = insertelement <8 x i8> %6, i8 %g, i64 6
+ %8 = insertelement <8 x i8> %7, i8 %h, i64 7
+ store <8 x i8> %8, ptr %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 4b6285b2732fe5..c1810c678ea522 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -12,34 +12,22 @@ target triple = "aarch64-unknown-linux-gnu"
define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-LABEL: concat_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: mov z2.h, z1.h[3]
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov z2.h, z1.h[3]
; CHECK-NEXT: mov z3.h, z1.h[2]
-; CHECK-NEXT: mov z1.h, z1.h[1]
-; CHECK-NEXT: mov z4.h, z0.h[3]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[2]
-; CHECK-NEXT: mov z0.h, z0.h[1]
-; CHECK-NEXT: strb w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w9, [sp, #14]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z4.h, z1.h[1]
+; CHECK-NEXT: mov z5.h, z0.h[3]
+; CHECK-NEXT: mov z6.h, z0.h[2]
+; CHECK-NEXT: mov z7.h, z0.h[1]
+; CHECK-NEXT: zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT: zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z0.b, z0.b, z7.b
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v8i8:
@@ -152,22 +140,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) {
; CHECK-LABEL: concat_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z2.s, z1.s[1]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: strh w9, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z3.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v4i16:
@@ -428,18 +408,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
; CHECK-LABEL: concat_v4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: str h1, [sp, #12]
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h2, [sp, #14]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z3.h, z0.h[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 50a05cb4b1e277..7d6336a43a4fd1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -326,29 +326,29 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
; CHECK-LABEL: load_sext_v2i64i256:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: mov z1.d, z0.d[1]
-; CHECK-NEXT: asr x9, x8, #63
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: stp x8, x9, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: asr x8, x10, #63
-; CHECK-NEXT: mov z0.d, x9
-; CHECK-NEXT: stp x10, x8, [sp, #16]
-; CHECK-NEXT: mov z1.d, x8
-; CHECK-NEXT: ldp q2, q4, [sp], #32
-; CHECK-NEXT: mov z3.d, z0.d[1]
-; CHECK-NEXT: mov z5.d, z1.d[1]
-; CHECK-NEXT: mov z6.d, z2.d[1]
-; CHECK-NEXT: fmov x2, d0
-; CHECK-NEXT: mov z0.d, z4.d[1]
-; CHECK-NEXT: fmov x6, d1
-; CHECK-NEXT: fmov x0, d2
-; CHECK-NEXT: fmov x4, d4
-; CHECK-NEXT: fmov x3, d3
-; CHECK-NEXT: fmov x7, d5
-; CHECK-NEXT: fmov x1, d6
-; CHECK-NEXT: fmov x5, d0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: asr x8, x8, #63
+; CHECK-NEXT: fmov d3, x8
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: asr x9, x9, #63
+; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT: mov z3.d, x9
+; CHECK-NEXT: fmov x2, d2
+; CHECK-NEXT: zip1 z1.d, z1.d, z4.d
+; CHECK-NEXT: mov z4.d, z2.d[1]
+; CHECK-NEXT: mov z5.d, z0.d[1]
+; CHECK-NEXT: mov z6.d, z3.d[1]
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: fmov x6, d3
+; CHECK-NEXT: mov z2.d, z1.d[1]
+; CHECK-NEXT: fmov x3, d4
+; CHECK-NEXT: fmov x1, d5
+; CHECK-NEXT: fmov x4, d1
+; CHECK-NEXT: fmov x7, d6
+; CHECK-NEXT: fmov x5, d2
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 2665696308463f..a728cbe97056db 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -10,23 +10,15 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
; CHECK-LABEL: extract_subvector_v8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z1.b, z0.b[7]
; CHECK-NEXT: mov z2.b, z0.b[6]
; CHECK-NEXT: mov z3.b, z0.b[5]
; CHECK-NEXT: mov z0.b, z0.b[4]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
@@ -53,23 +45,15 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
; CHECK-LABEL: extract_subvector_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z1.b, z0.b[7]
; CHECK-NEXT: mov z2.b, z0.b[6]
; CHECK-NEXT: mov z3.b, z0.b[5]
; CHECK-NEXT: mov z0.b, z0.b[4]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index dad53b31db0b0f..f1771a753826cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -1126,49 +1126,39 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
; SVE-LABEL: test_copysign_v4f16_v4f64:
; SVE: // %bb.0:
-; SVE-NEXT: sub sp, sp, #16
-; SVE-NEXT: .cfi_def_cfa_offset 16
-; SVE-NEXT: ldp q1, q0, [x1]
-; SVE-NEXT: ldr d4, [x0]
-; SVE-NEXT: and z4.h, z4.h, #0x7fff
-; SVE-NEXT: mov z2.d, z0.d[1]
-; SVE-NEXT: mov z3.d, z1.d[1]
-; SVE-NEXT: fcvt h0, d0
+; SVE-NEXT: ldp q0, q1, [x1]
+; SVE-NEXT: mov z2.d, z1.d[1]
+; SVE-NEXT: mov z3.d, z0.d[1]
; SVE-NEXT: fcvt h1, d1
+; SVE-NEXT: fcvt h0, d0
; SVE-NEXT: fcvt h2, d2
; SVE-NEXT: fcvt h3, d3
-; SVE-NEXT: str h0, [sp, #12]
-; SVE-NEXT: str h1, [sp, #8]
-; SVE-NEXT: str h2, [sp, #14]
-; SVE-NEXT: str h3, [sp, #10]
-; SVE-NEXT: ldr d0, [sp, #8]
+; SVE-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE-NEXT: zip1 z0.s, z0.s, z1.s
+; SVE-NEXT: ldr d1, [x0]
+; SVE-NEXT: and z1.h, z1.h, #0x7fff
; SVE-NEXT: and z0.h, z0.h, #0x8000
-; SVE-NEXT: orr z0.d, z4.d, z0.d
+; SVE-NEXT: orr z0.d, z1.d, z0.d
; SVE-NEXT: str d0, [x0]
-; SVE-NEXT: add sp, sp, #16
; SVE-NEXT: ret
;
; SVE2-LABEL: test_copysign_v4f16_v4f64:
; SVE2: // %bb.0:
-; SVE2-NEXT: sub sp, sp, #16
-; SVE2-NEXT: .cfi_def_cfa_offset 16
-; SVE2-NEXT: ldp q2, q1, [x1]
-; SVE2-NEXT: mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT: ldr d5, [x0]
-; SVE2-NEXT: mov z3.d, z1.d[1]
-; SVE2-NEXT: mov z4.d, z2.d[1]
+; SVE2-NEXT: ldp q0, q1, [x1]
+; SVE2-NEXT: mov z2.d, z1.d[1]
+; SVE2-NEXT: mov z3.d, z0.d[1]
; SVE2-NEXT: fcvt h1, d1
+; SVE2-NEXT: fcvt h0, d0
; SVE2-NEXT: fcvt h2, d2
; SVE2-NEXT: fcvt h3, d3
-; SVE2-NEXT: fcvt h4, d4
-; SVE2-NEXT: str h1, [sp, #12]
-; SVE2-NEXT: str h2, [sp, #8]
-; SVE2-NEXT: str h3, [sp, #14]
-; SVE2-NEXT: str h4, [sp, #10]
-; SVE2-NEXT: ldr d1, [sp, #8]
-; SVE2-NEXT: bsl z5.d, z5.d, z1.d, z0.d
-; SVE2-NEXT: str d5, [x0]
-; SVE2-NEXT: add sp, sp, #16
+; SVE2-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2-NEXT: mov z2.h, #32767 // =0x7fff
+; SVE2-NEXT: zip1 z0.s, z0.s, z1.s
+; SVE2-NEXT: ldr d1, [x0]
+; SVE2-NEXT: bsl z1.d, z1.d, z0.d, z2.d
+; SVE2-NEXT: str d1, [x0]
; SVE2-NEXT: ret
;
; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index a206fbc5102953..11fee267660c03 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -443,9 +443,10 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: fcvtzu x8, h0
; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: stp x8, x9, [sp, #-16]!
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
@@ -471,19 +472,20 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzu_v4f16_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fcvtzu x8, h0
-; CHECK-NEXT: mov z2.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: fcvtzu x10, h2
-; CHECK-NEXT: fcvtzu x11, h0
-; CHECK-NEXT: stp x8, x9, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: ldp q1, q0, [sp]
+; CHECK-NEXT: mov z1.h, z0.h[3]
+; CHECK-NEXT: mov z2.h, z0.h[2]
+; CHECK-NEXT: mov z3.h, z0.h[1]
+; CHECK-NEXT: fcvtzu x10, h0
+; CHECK-NEXT: fcvtzu x8, h1
+; CHECK-NEXT: fcvtzu x9, h2
+; CHECK-NEXT: fcvtzu x11, h3
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x11
+; CHECK-NEXT: zip1 z1.d, z2.d, z1.d
; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
@@ -521,31 +523,35 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: fcvtzu x12, h0
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z3.h, z0.h[2]
+; CHECK-NEXT: mov z4.h, z0.h[1]
+; CHECK-NEXT: fcvtzu x10, h0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: fcvtzu x8, h1
-; CHECK-NEXT: mov z3.h, z1.h[3]
-; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: fcvtzu x9, h2
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: fcvtzu x10, h3
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzu x11, h1
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-64]!
-; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: fcvtzu x8, h2
; CHECK-NEXT: fcvtzu x9, h3
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: fcvtzu x10, h0
-; CHECK-NEXT: ldp q2, q3, [sp]
-; CHECK-NEXT: stp x12, x8, [sp, #32]
-; CHECK-NEXT: stp x10, x9, [sp, #48]
-; CHECK-NEXT: ldp q1, q0, [sp, #32]
-; CHECK-NEXT: stp q2, q3, [x1, #32]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: fcvtzu x11, h4
+; CHECK-NEXT: mov z5.h, z1.h[3]
+; CHECK-NEXT: mov z6.h, z1.h[2]
+; CHECK-NEXT: mov z2.h, z1.h[1]
+; CHECK-NEXT: fcvtzu x14, h1
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: fcvtzu x12, h5
+; CHECK-NEXT: fcvtzu x13, h6
+; CHECK-NEXT: fcvtzu x15, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x12
+; CHECK-NEXT: fmov d4, x13
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: fmov d3, x14
+; CHECK-NEXT: zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT: fmov d4, x15
+; CHECK-NEXT: stp q2, q0, [x1]
+; CHECK-NEXT: zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT: stp q3, q1, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
@@ -598,57 +604,67 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzu_v16f16_v16i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z4.h, z1.h[1]
+; CHECK-NEXT: mov z6.h, z1.h[3]
+; CHECK-NEXT: fcvtzu x9, h1
+; CHECK-NEXT: fcvtzu x8, h0
+; CHECK-NEXT: mov z7.h, z0.h[1]
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT: mov z4.h, z2.h[1]
-; CHECK-NEXT: fcvtzu x8, h2
-; CHECK-NEXT: mov z5.h, z2.h[3]
-; CHECK-NEXT: mov z2.h, z2.h[2]
-; CHECK-NEXT: fcvtzu x12, h3
-; CHECK-NEXT: fcvtzu x9, h4
-; CHECK-NEXT: mov z4.h, z3.h[1]
-; CHECK-NEXT: fcvtzu x10, h5
-; CHECK-NEXT: mov z5.h, z3.h[3]
-; CHECK-NEXT: fcvtzu x11, h2
-; CHECK-NEXT: mov z2.h, z3.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-128]!
-; CHECK-NEXT: .cfi_def_cfa_offset 128
-; CHECK-NEXT: fcvtzu x8, h4
-; CHECK-NEXT: fcvtzu x9, h5
-; CHECK-NEXT: stp x11, x10, [sp, #16]
+; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
; CHECK-NEXT: fcvtzu x10, h2
-; CHECK-NEXT: mov z3.h, z1.h[1]
-; CHECK-NEXT: mov z4.h, z1.h[3]
-; CHECK-NEXT: fcvtzu x11, h1
+; CHECK-NEXT: fcvtzu x11, h4
+; CHECK-NEXT: fcvtzu x12, h6
; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: stp x12, x8, [sp, #64]
-; CHECK-NEXT: fcvtzu x12, h3
-; CHECK-NEXT: fcvtzu x8, h4
-; CHECK-NEXT: stp x10, x9, [sp, #80]
-; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzu x10, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x11, x12, [sp, #32]
-; CHECK-NEXT: fcvtzu x11, h2
-; CHECK-NEXT: fcvtzu x12, h3
-; CHECK-NEXT: stp x9, x8, [sp, #48]
-; CHECK-NEXT: fcvtzu x8, h0
-; CHECK-NEXT: ldp q0, q1, [sp]
-; CHECK-NEXT: ldp q3, q4, [sp, #64]
-; CHECK-NEXT: stp x10, x11, [sp, #96]
-; CHECK-NEXT: ldp q6, q7, [sp, #32]
-; CHECK-NEXT: stp x8, x12, [sp, #112]
-; CHECK-NEXT: ldp q5, q2, [sp, #96]
-; CHECK-NEXT: stp q0, q1, [x1, #32]
-; CHECK-NEXT: stp q6, q7, [x1]
-; CHECK-NEXT: stp q3, q4, [x1, #96]
-; CHECK-NEXT: stp q5, q2, [x1, #64]
-; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: fmov d16, x9
+; CHECK-NEXT: mov z2.h, z3.h[3]
+; CHECK-NEXT: mov z4.h, z5.h[3]
+; CHECK-NEXT: fcvtzu x14, h3
+; CHECK-NEXT: fcvtzu x13, h1
+; CHECK-NEXT: fcvtzu x15, h5
+; CHECK-NEXT: mov z1.h, z3.h[1]
+; CHECK-NEXT: mov z6.h, z5.h[1]
+; CHECK-NEXT: mov z5.h, z5.h[2]
+; CHECK-NEXT: mov z3.h, z3.h[2]
+; CHECK-NEXT: fcvtzu x9, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fcvtzu x10, h4
+; CHECK-NEXT: fmov d4, x11
+; CHECK-NEXT: fcvtzu x11, h7
+; CHECK-NEXT: fmov d7, x12
+; CHECK-NEXT: fcvtzu x12, h0
+; CHECK-NEXT: fmov d0, x13
+; CHECK-NEXT: fcvtzu x13, h1
+; CHECK-NEXT: fmov d1, x14
+; CHECK-NEXT: fcvtzu x14, h6
+; CHECK-NEXT: fmov d6, x15
+; CHECK-NEXT: fcvtzu x15, h5
+; CHECK-NEXT: fmov d5, x9
+; CHECK-NEXT: fcvtzu x9, h3
+; CHECK-NEXT: zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT: fmov d16, x8
+; CHECK-NEXT: zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT: fmov d3, x12
+; CHECK-NEXT: fmov d7, x10
+; CHECK-NEXT: stp q4, q0, [x1, #64]
+; CHECK-NEXT: fmov d0, x14
+; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT: zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT: zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT: fmov d16, x15
+; CHECK-NEXT: stp q3, q2, [x1]
+; CHECK-NEXT: fmov d2, x13
+; CHECK-NEXT: zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT: stp q0, q7, [x1, #96]
+; CHECK-NEXT: stp q1, q4, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
@@ -1216,26 +1232,18 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
; CHECK-LABEL: fcvtzu_v4f64_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: mov z2.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z1.s[1]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: mov z2.s, z1.s[1]
+; CHECK-NEXT: mov z3.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
@@ -1270,40 +1278,29 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
; CHECK-LABEL: fcvtzu_v8f64_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldp q1, q0, [x0, #32]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: ldp q2, q3, [x0]
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.s, z3.s[1]
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT: mov z4.s, z0.s[1]
+; CHECK-NEXT: mov z5.s, z1.s[1]
+; CHECK-NEXT: mov z6.s, z3.s[1]
+; CHECK-NEXT: mov z7.s, z2.s[1]
+; CHECK-NEXT: zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
@@ -1360,73 +1357,50 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzu_v16f64_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: ldp q5, q6, [x0, #96]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x0, #96]
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: ldp q0, q4, [x0, #32]
+; CHECK-NEXT: ldp q2, q7, [x0, #64]
+; CHECK-NEXT: ldp q1, q3, [x0]
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d
; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT: ldp q6, q7, [x0, #64]
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s
; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z16.s, z1.s[1]
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z0.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z3.s[1]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: movprfx z3, z7
-; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT: mov z3.s, z5.s[1]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: ldp q1, q0, [sp]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: mov z17.s, z6.s[1]
+; CHECK-NEXT: mov z16.s, z4.s[1]
+; CHECK-NEXT: mov z18.s, z5.s[1]
+; CHECK-NEXT: mov z21.s, z0.s[1]
+; CHECK-NEXT: mov z19.s, z7.s[1]
+; CHECK-NEXT: mov z20.s, z2.s[1]
+; CHECK-NEXT: mov z22.s, z3.s[1]
+; CHECK-NEXT: mov z23.s, z1.s[1]
+; CHECK-NEXT: zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT: zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT: zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT: zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT: zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT: zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: stp q0, q2, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
@@ -2187,9 +2161,10 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: fcvtzs x8, h0
; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: stp x8, x9, [sp, #-16]!
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
@@ -2215,19 +2190,20 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzs_v4f16_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fcvtzs x8, h0
-; CHECK-NEXT: mov z2.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: fcvtzs x10, h2
-; CHECK-NEXT: fcvtzs x11, h0
-; CHECK-NEXT: stp x8, x9, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: ldp q1, q0, [sp]
+; CHECK-NEXT: mov z1.h, z0.h[3]
+; CHECK-NEXT: mov z2.h, z0.h[2]
+; CHECK-NEXT: mov z3.h, z0.h[1]
+; CHECK-NEXT: fcvtzs x10, h0
+; CHECK-NEXT: fcvtzs x8, h1
+; CHECK-NEXT: fcvtzs x9, h2
+; CHECK-NEXT: fcvtzs x11, h3
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x11
+; CHECK-NEXT: zip1 z1.d, z2.d, z1.d
; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
@@ -2265,31 +2241,35 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: fcvtzs x12, h0
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z3.h, z0.h[2]
+; CHECK-NEXT: mov z4.h, z0.h[1]
+; CHECK-NEXT: fcvtzs x10, h0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: fcvtzs x8, h1
-; CHECK-NEXT: mov z3.h, z1.h[3]
-; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: fcvtzs x9, h2
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: fcvtzs x10, h3
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzs x11, h1
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-64]!
-; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: fcvtzs x8, h2
; CHECK-NEXT: fcvtzs x9, h3
-; CHECK-NEXT: stp x11, x10, [sp, #16]
-; CHECK-NEXT: fcvtzs x10, h0
-; CHECK-NEXT: ldp q2, q3, [sp]
-; CHECK-NEXT: stp x12, x8, [sp, #32]
-; CHECK-NEXT: stp x10, x9, [sp, #48]
-; CHECK-NEXT: ldp q1, q0, [sp, #32]
-; CHECK-NEXT: stp q2, q3, [x1, #32]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: fcvtzs x11, h4
+; CHECK-NEXT: mov z5.h, z1.h[3]
+; CHECK-NEXT: mov z6.h, z1.h[2]
+; CHECK-NEXT: mov z2.h, z1.h[1]
+; CHECK-NEXT: fcvtzs x14, h1
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: fcvtzs x12, h5
+; CHECK-NEXT: fcvtzs x13, h6
+; CHECK-NEXT: fcvtzs x15, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: fmov d1, x12
+; CHECK-NEXT: fmov d4, x13
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: fmov d3, x14
+; CHECK-NEXT: zip1 z1.d, z4.d, z1.d
+; CHECK-NEXT: fmov d4, x15
+; CHECK-NEXT: stp q2, q0, [x1]
+; CHECK-NEXT: zip1 z3.d, z3.d, z4.d
+; CHECK-NEXT: stp q3, q1, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
@@ -2342,57 +2322,67 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzs_v16f16_v16i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z2.h, z0.h[3]
+; CHECK-NEXT: mov z4.h, z1.h[1]
+; CHECK-NEXT: mov z6.h, z1.h[3]
+; CHECK-NEXT: fcvtzs x9, h1
+; CHECK-NEXT: fcvtzs x8, h0
+; CHECK-NEXT: mov z7.h, z0.h[1]
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT: mov z4.h, z2.h[1]
-; CHECK-NEXT: fcvtzs x8, h2
-; CHECK-NEXT: mov z5.h, z2.h[3]
-; CHECK-NEXT: mov z2.h, z2.h[2]
-; CHECK-NEXT: fcvtzs x12, h3
-; CHECK-NEXT: fcvtzs x9, h4
-; CHECK-NEXT: mov z4.h, z3.h[1]
-; CHECK-NEXT: fcvtzs x10, h5
-; CHECK-NEXT: mov z5.h, z3.h[3]
-; CHECK-NEXT: fcvtzs x11, h2
-; CHECK-NEXT: mov z2.h, z3.h[2]
-; CHECK-NEXT: stp x8, x9, [sp, #-128]!
-; CHECK-NEXT: .cfi_def_cfa_offset 128
-; CHECK-NEXT: fcvtzs x8, h4
-; CHECK-NEXT: fcvtzs x9, h5
-; CHECK-NEXT: stp x11, x10, [sp, #16]
+; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
; CHECK-NEXT: fcvtzs x10, h2
-; CHECK-NEXT: mov z3.h, z1.h[1]
-; CHECK-NEXT: mov z4.h, z1.h[3]
-; CHECK-NEXT: fcvtzs x11, h1
+; CHECK-NEXT: fcvtzs x11, h4
+; CHECK-NEXT: fcvtzs x12, h6
; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: stp x12, x8, [sp, #64]
-; CHECK-NEXT: fcvtzs x12, h3
-; CHECK-NEXT: fcvtzs x8, h4
-; CHECK-NEXT: stp x10, x9, [sp, #80]
-; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fcvtzs x10, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: stp x11, x12, [sp, #32]
-; CHECK-NEXT: fcvtzs x11, h2
-; CHECK-NEXT: fcvtzs x12, h3
-; CHECK-NEXT: stp x9, x8, [sp, #48]
-; CHECK-NEXT: fcvtzs x8, h0
-; CHECK-NEXT: ldp q0, q1, [sp]
-; CHECK-NEXT: ldp q3, q4, [sp, #64]
-; CHECK-NEXT: stp x10, x11, [sp, #96]
-; CHECK-NEXT: ldp q6, q7, [sp, #32]
-; CHECK-NEXT: stp x8, x12, [sp, #112]
-; CHECK-NEXT: ldp q5, q2, [sp, #96]
-; CHECK-NEXT: stp q0, q1, [x1, #32]
-; CHECK-NEXT: stp q6, q7, [x1]
-; CHECK-NEXT: stp q3, q4, [x1, #96]
-; CHECK-NEXT: stp q5, q2, [x1, #64]
-; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: fmov d16, x9
+; CHECK-NEXT: mov z2.h, z3.h[3]
+; CHECK-NEXT: mov z4.h, z5.h[3]
+; CHECK-NEXT: fcvtzs x14, h3
+; CHECK-NEXT: fcvtzs x13, h1
+; CHECK-NEXT: fcvtzs x15, h5
+; CHECK-NEXT: mov z1.h, z3.h[1]
+; CHECK-NEXT: mov z6.h, z5.h[1]
+; CHECK-NEXT: mov z5.h, z5.h[2]
+; CHECK-NEXT: mov z3.h, z3.h[2]
+; CHECK-NEXT: fcvtzs x9, h2
+; CHECK-NEXT: fmov d2, x10
+; CHECK-NEXT: fcvtzs x10, h4
+; CHECK-NEXT: fmov d4, x11
+; CHECK-NEXT: fcvtzs x11, h7
+; CHECK-NEXT: fmov d7, x12
+; CHECK-NEXT: fcvtzs x12, h0
+; CHECK-NEXT: fmov d0, x13
+; CHECK-NEXT: fcvtzs x13, h1
+; CHECK-NEXT: fmov d1, x14
+; CHECK-NEXT: fcvtzs x14, h6
+; CHECK-NEXT: fmov d6, x15
+; CHECK-NEXT: fcvtzs x15, h5
+; CHECK-NEXT: fmov d5, x9
+; CHECK-NEXT: fcvtzs x9, h3
+; CHECK-NEXT: zip1 z4.d, z16.d, z4.d
+; CHECK-NEXT: fmov d16, x8
+; CHECK-NEXT: zip1 z0.d, z0.d, z7.d
+; CHECK-NEXT: fmov d3, x12
+; CHECK-NEXT: fmov d7, x10
+; CHECK-NEXT: stp q4, q0, [x1, #64]
+; CHECK-NEXT: fmov d0, x14
+; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT: fmov d3, x11
+; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT: zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT: zip1 z3.d, z16.d, z3.d
+; CHECK-NEXT: fmov d16, x15
+; CHECK-NEXT: stp q3, q2, [x1]
+; CHECK-NEXT: fmov d2, x13
+; CHECK-NEXT: zip1 z7.d, z16.d, z7.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z2.d
+; CHECK-NEXT: stp q0, q7, [x1, #96]
+; CHECK-NEXT: stp q1, q4, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
@@ -2962,26 +2952,18 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
; CHECK-LABEL: fcvtzs_v4f64_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: mov z2.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z1.s[1]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: mov z2.s, z1.s[1]
+; CHECK-NEXT: mov z3.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
@@ -3016,40 +2998,29 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
; CHECK-LABEL: fcvtzs_v8f64_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldp q1, q0, [x0, #32]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: ldp q2, q3, [x0]
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.s, z3.s[1]
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT: mov z4.s, z0.s[1]
+; CHECK-NEXT: mov z5.s, z1.s[1]
+; CHECK-NEXT: mov z6.s, z3.s[1]
+; CHECK-NEXT: mov z7.s, z2.s[1]
+; CHECK-NEXT: zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z5.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z6.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z7.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: zip1 z1.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
@@ -3106,73 +3077,50 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
; CHECK-LABEL: fcvtzs_v16f64_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: ldp q5, q6, [x0, #96]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x0, #96]
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: ldp q0, q4, [x0, #32]
+; CHECK-NEXT: ldp q2, q7, [x0, #64]
+; CHECK-NEXT: ldp q1, q3, [x0]
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d
; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT: ldp q6, q7, [x0, #64]
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d
+; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s
; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z16.s, z1.s[1]
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z0.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z3.s[1]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: movprfx z3, z7
-; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s
-; CHECK-NEXT: mov z3.s, z5.s[1]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: strh w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z2.s[1]
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: ldp q1, q0, [sp]
-; CHECK-NEXT: stp q1, q0, [x1]
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: mov z17.s, z6.s[1]
+; CHECK-NEXT: mov z16.s, z4.s[1]
+; CHECK-NEXT: mov z18.s, z5.s[1]
+; CHECK-NEXT: mov z21.s, z0.s[1]
+; CHECK-NEXT: mov z19.s, z7.s[1]
+; CHECK-NEXT: mov z20.s, z2.s[1]
+; CHECK-NEXT: mov z22.s, z3.s[1]
+; CHECK-NEXT: mov z23.s, z1.s[1]
+; CHECK-NEXT: zip1 z6.h, z6.h, z17.h
+; CHECK-NEXT: zip1 z4.h, z4.h, z16.h
+; CHECK-NEXT: zip1 z5.h, z5.h, z18.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z21.h
+; CHECK-NEXT: zip1 z7.h, z7.h, z19.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z20.h
+; CHECK-NEXT: zip1 z3.h, z3.h, z22.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z23.h
+; CHECK-NEXT: zip1 z5.s, z5.s, z6.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z4.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z7.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT: zip1 z2.d, z2.d, z5.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: stp q0, q2, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 035c76b569298a..ad5f91a5f39a49 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -8,25 +8,18 @@ target triple = "aarch64-unknown-linux-gnu"
define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask) {
; CHECK-LABEL: select_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
; CHECK-NEXT: mov z3.s, z2.s[1]
-; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d2, [sp, #8]
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
; CHECK-NEXT: lsl z2.h, z2.h, #15
; CHECK-NEXT: asr z2.h, z2.h, #15
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: select_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index d77473ed8f08e5..275d13ebfd9491 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -506,14 +506,10 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
; CHECK-LABEL: insertelement_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: fmov h1, #5.00000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: insertelement_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 0c712a15d4de2f..e595686cb4975d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1140,18 +1140,14 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
; CHECK-LABEL: ucvtf_v2i64_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z1.d, z0.d[1]
; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: ucvtf h0, x8
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: ucvtf h1, x8
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ucvtf h1, x9
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
@@ -2598,18 +2594,14 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
; CHECK-LABEL: scvtf_v2i64_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z1.d, z0.d[1]
; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: scvtf h0, x8
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: scvtf h1, x8
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: scvtf h1, x9
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 270f05a806b82d..613543310f2c31 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -10,25 +10,20 @@ declare void @def(ptr)
define void @alloc_v4i8(ptr %st_ptr) nounwind {
; CHECK-LABEL: alloc_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #28
-; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT: add x20, sp, #28
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x20, sp, #12
; CHECK-NEXT: bl def
; CHECK-NEXT: ptrue p0.b, vl2
; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x20]
; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: mov z2.b, z0.b[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
; CHECK-NEXT: st1b { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: alloc_v4i8:
@@ -62,32 +57,28 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-LABEL: alloc_v6i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: add x0, sp, #8
; CHECK-NEXT: bl def
-; CHECK-NEXT: ldr d0, [sp, #24]
+; CHECK-NEXT: ldr d0, [sp, #8]
; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: add x8, sp, #4
; CHECK-NEXT: ptrue p1.s, vl2
; CHECK-NEXT: mov z1.b, z0.b[3]
-; CHECK-NEXT: mov z2.b, z0.b[5]
-; CHECK-NEXT: mov z0.b, z0.b[1]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: add x8, sp, #20
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: st1b { z0.h }, p0, [x8]
-; CHECK-NEXT: ld1h { z0.s }, p1/z, [x8]
-; CHECK-NEXT: strb w9, [x19, #2]
+; CHECK-NEXT: mov z2.b, z0.b[1]
+; CHECK-NEXT: mov z0.b, z0.b[5]
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z1.s, z1.s, z0.s
+; CHECK-NEXT: st1b { z1.h }, p0, [x8]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x19, #2]
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strh w8, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: alloc_v6i8:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 5f4b9dd1592cf2..9055b2efba3282 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1466,23 +1466,18 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
; CHECK-LABEL: masked_load_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str wzr, [sp, #12]
+; CHECK-NEXT: fmov s1, wzr
+; CHECK-NEXT: mov z2.s, z0.s[1]
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_v2f16:
@@ -2318,33 +2313,21 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
; CHECK-LABEL: masked_load_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: mov z1.b, z0.b[3]
; CHECK-NEXT: mov z2.b, z0.b[2]
+; CHECK-NEXT: mov x8, #4 // =0x4
; CHECK-NEXT: mov z3.b, z0.b[1]
; CHECK-NEXT: mov z4.b, z0.b[7]
-; CHECK-NEXT: strh w8, [sp, #-16]!
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.b, z0.b[6]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[5]
-; CHECK-NEXT: mov z0.b, z0.b[4]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #4]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w9, [sp, #14]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: ldp d0, d1, [sp]
+; CHECK-NEXT: mov z5.b, z0.b[6]
+; CHECK-NEXT: mov z6.b, z0.b[5]
+; CHECK-NEXT: mov z7.b, z0.b[4]
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z2.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z3.h, z7.h, z6.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: zip1 z1.s, z3.s, z2.s
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: lsl z0.s, z0.s, #31
@@ -2357,7 +2340,6 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_v8f32:
@@ -2684,23 +2666,21 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
; CHECK-LABEL: masked_load_zext_v3i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: strh w3, [sp, #12]
+; CHECK-NEXT: fmov s0, w2
+; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: strh w2, [sp, #10]
-; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT: strh w1, [sp, #8]
-; CHECK-NEXT: ldr d1, [sp, #8]
-; CHECK-NEXT: and z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT: fmov s1, w3
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
@@ -2759,23 +2739,21 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
; CHECK-LABEL: masked_load_sext_v3i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: strh w3, [sp, #12]
+; CHECK-NEXT: fmov s0, w2
+; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: adrp x8, .LCPI14_0
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: strh w2, [sp, #10]
-; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT: strh w1, [sp, #8]
-; CHECK-NEXT: ldr d1, [sp, #8]
-; CHECK-NEXT: and z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT: fmov s1, w3
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: and z0.d, z0.d, z1.d
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 0c3411e5f55148..265480b571970f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -589,23 +589,18 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
; CHECK-LABEL: masked_store_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str wzr, [sp, #12]
+; CHECK-NEXT: fmov s1, wzr
+; CHECK-NEXT: mov z2.s, z0.s[1]
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z1.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: lsl z0.h, z0.h, #15
; CHECK-NEXT: asr z0.h, z0.h, #15
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_store_v2f16:
@@ -1014,48 +1009,33 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
; CHECK-LABEL: masked_store_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z1.b, z0.b[7]
; CHECK-NEXT: mov z2.b, z0.b[6]
+; CHECK-NEXT: mov x8, #4 // =0x4
; CHECK-NEXT: mov z3.b, z0.b[5]
; CHECK-NEXT: mov z4.b, z0.b[4]
+; CHECK-NEXT: mov z5.b, z0.b[3]
+; CHECK-NEXT: mov z6.b, z0.b[2]
+; CHECK-NEXT: mov z7.b, z0.b[1]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[3]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[2]
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: mov z4.b, z0.b[1]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ldr d1, [sp, #8]
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z2.h, z4.h, z3.h
+; CHECK-NEXT: zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z7.h
+; CHECK-NEXT: zip1 z1.s, z2.s, z1.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z3.s
; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: lsl z0.s, z0.s, #31
; CHECK-NEXT: asr z1.s, z1.s, #31
+; CHECK-NEXT: asr z0.s, z0.s, #31
; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0
; CHECK-NEXT: mov z1.s, #0 // =0x0
-; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr d0, [sp]
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: lsl z0.s, z0.s, #31
-; CHECK-NEXT: asr z0.s, z0.s, #31
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; CHECK-NEXT: st1w { z1.s }, p0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: masked_store_v8f32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index b91f813c5141bb..8b296d9fbc215d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -9,65 +9,44 @@ target triple = "aarch64-unknown-linux-gnu"
define void @zip1_v32i8(ptr %a, ptr %b) {
; CHECK-LABEL: zip1_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: mov z2.b, z0.b[15]
-; CHECK-NEXT: mov z3.b, z0.b[14]
-; CHECK-NEXT: mov z4.b, z0.b[13]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z3.b, z0.b[11]
-; CHECK-NEXT: mov z2.b, z0.b[12]
-; CHECK-NEXT: strb w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[10]
-; CHECK-NEXT: strb w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[9]
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[8]
-; CHECK-NEXT: strb w9, [sp, #8]
+; CHECK-NEXT: mov z4.b, z0.b[14]
+; CHECK-NEXT: mov z6.b, z0.b[13]
+; CHECK-NEXT: mov z3.b, z1.b[15]
+; CHECK-NEXT: mov z5.b, z1.b[14]
+; CHECK-NEXT: mov z7.b, z1.b[13]
+; CHECK-NEXT: mov z16.b, z0.b[12]
+; CHECK-NEXT: mov z17.b, z1.b[12]
+; CHECK-NEXT: mov z18.b, z0.b[11]
+; CHECK-NEXT: mov z19.b, z1.b[11]
+; CHECK-NEXT: mov z20.b, z0.b[10]
+; CHECK-NEXT: mov z21.b, z1.b[10]
+; CHECK-NEXT: mov z22.b, z0.b[9]
+; CHECK-NEXT: mov z23.b, z1.b[9]
+; CHECK-NEXT: mov z24.b, z0.b[8]
+; CHECK-NEXT: mov z25.b, z1.b[8]
+; CHECK-NEXT: zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT: zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT: zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT: zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT: zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT: zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT: zip1 z16.b, z22.b, z23.b
; CHECK-NEXT: zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT: strb w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[15]
-; CHECK-NEXT: strb w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[14]
-; CHECK-NEXT: strb w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[13]
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[12]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[11]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[10]
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[9]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[8]
-; CHECK-NEXT: strb w9, [sp, #5]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strb w8, [sp, #3]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #1]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT: zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z4.h, z7.h, z6.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip1_v32i8:
@@ -159,123 +138,97 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
define void @zip_v32i16(ptr %a, ptr %b) {
; CHECK-LABEL: zip_v32i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: ldp q1, q3, [x1]
-; CHECK-NEXT: ldp q0, q4, [x0]
-; CHECK-NEXT: ldp q2, q5, [x0, #32]
-; CHECK-NEXT: mov z16.h, z3.h[7]
-; CHECK-NEXT: mov z18.h, z3.h[6]
-; CHECK-NEXT: mov z17.h, z4.h[7]
-; CHECK-NEXT: ldp q6, q7, [x1, #32]
-; CHECK-NEXT: mov z19.h, z4.h[6]
-; CHECK-NEXT: fmov w8, s16
+; CHECK-NEXT: .cfi_offset b8, -8
+; CHECK-NEXT: .cfi_offset b9, -16
+; CHECK-NEXT: .cfi_offset b10, -24
+; CHECK-NEXT: .cfi_offset b11, -32
+; CHECK-NEXT: .cfi_offset b12, -40
+; CHECK-NEXT: .cfi_offset b13, -48
+; CHECK-NEXT: .cfi_offset b14, -56
+; CHECK-NEXT: .cfi_offset b15, -64
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q2, q3, [x1]
+; CHECK-NEXT: mov z5.h, z1.h[7]
+; CHECK-NEXT: mov z7.h, z1.h[6]
+; CHECK-NEXT: mov z17.h, z1.h[5]
+; CHECK-NEXT: mov z4.h, z3.h[7]
+; CHECK-NEXT: mov z6.h, z3.h[6]
; CHECK-NEXT: mov z16.h, z3.h[5]
-; CHECK-NEXT: fmov w9, s17
-; CHECK-NEXT: mov z17.h, z4.h[5]
-; CHECK-NEXT: mov z20.h, z7.h[6]
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s18
+; CHECK-NEXT: mov z20.h, z2.h[7]
+; CHECK-NEXT: mov z21.h, z0.h[7]
; CHECK-NEXT: mov z18.h, z3.h[4]
-; CHECK-NEXT: strh w9, [sp, #28]
-; CHECK-NEXT: fmov w9, s19
-; CHECK-NEXT: mov z19.h, z5.h[7]
-; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: mov z16.h, z4.h[4]
-; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: zip1 z4.h, z5.h, z7.h
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: mov z17.h, z1.h[7]
-; CHECK-NEXT: add z3.h, z3.h, z4.h
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z0.h[7]
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: mov z16.h, z1.h[6]
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: mov z17.h, z0.h[6]
-; CHECK-NEXT: strh w8, [sp, #62]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z1.h[5]
-; CHECK-NEXT: strh w8, [sp, #60]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: mov z16.h, z0.h[5]
-; CHECK-NEXT: strh w8, [sp, #58]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: mov z17.h, z1.h[4]
-; CHECK-NEXT: strh w8, [sp, #56]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z0.h[4]
-; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: zip1 z1.h, z2.h, z6.h
-; CHECK-NEXT: strh w8, [sp, #54]
-; CHECK-NEXT: fmov w8, s16
-; CHECK-NEXT: ldr q16, [sp, #16]
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: strh w8, [sp, #52]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: strh w8, [sp, #50]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z7.h[7]
-; CHECK-NEXT: strh w8, [sp, #48]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z5.h[6]
-; CHECK-NEXT: ldr q17, [sp, #48]
-; CHECK-NEXT: strh w8, [sp, #46]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z7.h[5]
-; CHECK-NEXT: strh w8, [sp, #44]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z5.h[5]
-; CHECK-NEXT: strh w8, [sp, #42]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z7.h[4]
-; CHECK-NEXT: strh w8, [sp, #40]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z5.h[4]
-; CHECK-NEXT: strh w8, [sp, #38]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z6.h[7]
-; CHECK-NEXT: strh w8, [sp, #36]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z2.h[7]
-; CHECK-NEXT: strh w8, [sp, #34]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z6.h[6]
-; CHECK-NEXT: strh w8, [sp, #32]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z2.h[6]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z6.h[5]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z2.h[5]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z6.h[4]
-; CHECK-NEXT: fmov w9, s19
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z2.h[4]
-; CHECK-NEXT: strh w9, [sp, #4]
-; CHECK-NEXT: ldr q2, [sp, #32]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: fmov w9, s18
-; CHECK-NEXT: add z2.h, z16.h, z2.h
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: ldr q4, [sp]
-; CHECK-NEXT: stp q3, q2, [x0, #32]
-; CHECK-NEXT: add z1.h, z17.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov z19.h, z1.h[4]
+; CHECK-NEXT: mov z22.h, z2.h[6]
+; CHECK-NEXT: mov z23.h, z0.h[6]
+; CHECK-NEXT: zip1 z24.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z25.h, z7.h, z6.h
+; CHECK-NEXT: zip1 z17.h, z17.h, z16.h
+; CHECK-NEXT: ldp q4, q6, [x0, #32]
+; CHECK-NEXT: zip1 z16.h, z21.h, z20.h
+; CHECK-NEXT: ldp q5, q7, [x1, #32]
+; CHECK-NEXT: zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z19.s, z25.s, z24.s
+; CHECK-NEXT: zip1 z22.h, z23.h, z22.h
+; CHECK-NEXT: mov z23.h, z2.h[5]
+; CHECK-NEXT: mov z21.h, z6.h[7]
+; CHECK-NEXT: mov z24.h, z0.h[5]
+; CHECK-NEXT: mov z25.h, z2.h[4]
+; CHECK-NEXT: mov z20.h, z7.h[7]
+; CHECK-NEXT: mov z26.h, z0.h[4]
+; CHECK-NEXT: mov z27.h, z6.h[6]
+; CHECK-NEXT: mov z28.h, z7.h[5]
+; CHECK-NEXT: mov z29.h, z6.h[5]
+; CHECK-NEXT: mov z30.h, z7.h[4]
+; CHECK-NEXT: mov z31.h, z6.h[4]
+; CHECK-NEXT: mov z8.h, z5.h[7]
+; CHECK-NEXT: mov z9.h, z4.h[7]
+; CHECK-NEXT: zip1 z20.h, z21.h, z20.h
+; CHECK-NEXT: mov z21.h, z7.h[6]
+; CHECK-NEXT: mov z10.h, z5.h[6]
+; CHECK-NEXT: mov z11.h, z4.h[6]
+; CHECK-NEXT: mov z12.h, z5.h[5]
+; CHECK-NEXT: mov z13.h, z4.h[5]
+; CHECK-NEXT: mov z14.h, z5.h[4]
+; CHECK-NEXT: mov z15.h, z4.h[4]
+; CHECK-NEXT: zip1 z23.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z21.h, z27.h, z21.h
+; CHECK-NEXT: zip1 z27.h, z29.h, z28.h
+; CHECK-NEXT: zip1 z28.h, z31.h, z30.h
+; CHECK-NEXT: zip1 z24.h, z26.h, z25.h
+; CHECK-NEXT: zip1 z25.h, z9.h, z8.h
+; CHECK-NEXT: zip1 z26.h, z11.h, z10.h
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z29.h, z13.h, z12.h
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z30.h, z15.h, z14.h
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z17.s, z18.s, z17.s
+; CHECK-NEXT: zip1 z18.s, z21.s, z20.s
+; CHECK-NEXT: zip1 z20.s, z28.s, z27.s
+; CHECK-NEXT: zip1 z16.s, z22.s, z16.s
+; CHECK-NEXT: zip1 z21.s, z24.s, z23.s
+; CHECK-NEXT: zip1 z1.h, z1.h, z3.h
+; CHECK-NEXT: zip1 z3.s, z26.s, z25.s
+; CHECK-NEXT: zip1 z22.s, z30.s, z29.s
+; CHECK-NEXT: zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z7.d, z17.d, z19.d
+; CHECK-NEXT: zip1 z17.d, z20.d, z18.d
+; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z2.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z4.d, z21.d, z16.d
+; CHECK-NEXT: zip1 z3.d, z22.d, z3.d
+; CHECK-NEXT: add z1.h, z1.h, z6.h
+; CHECK-NEXT: add z5.h, z7.h, z17.h
+; CHECK-NEXT: add z0.h, z0.h, z2.h
+; CHECK-NEXT: add z2.h, z4.h, z3.h
+; CHECK-NEXT: stp q1, q5, [x0, #32]
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip_v32i16:
@@ -436,41 +389,28 @@ define void @zip_v32i16(ptr %a, ptr %b) {
define void @zip1_v16i16(ptr %a, ptr %b) {
; CHECK-LABEL: zip1_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: mov z3.h, z0.h[6]
-; CHECK-NEXT: mov z4.h, z0.h[5]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[4]
-; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov z4.h, z0.h[6]
+; CHECK-NEXT: mov z6.h, z0.h[5]
; CHECK-NEXT: mov z3.h, z1.h[7]
+; CHECK-NEXT: mov z5.h, z1.h[6]
+; CHECK-NEXT: mov z7.h, z1.h[5]
+; CHECK-NEXT: mov z16.h, z0.h[4]
+; CHECK-NEXT: mov z17.h, z1.h[4]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[6]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z1.h[5]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.h, z1.h[4]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #10]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT: zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z5.h, z16.h, z17.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip1_v16i16:
@@ -530,8 +470,6 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
define void @zip1_v8i32(ptr %a, ptr %b) {
; CHECK-LABEL: zip1_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1, #16]
@@ -539,18 +477,13 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
; CHECK-NEXT: mov z2.s, z0.s[3]
; CHECK-NEXT: mov z4.s, z0.s[2]
; CHECK-NEXT: mov z3.s, z1.s[3]
+; CHECK-NEXT: mov z5.s, z1.s[2]
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z1.s[2]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w9, [sp]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z3.s, z4.s, z5.s
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip1_v8i32:
@@ -636,25 +569,18 @@ define void @zip_v4f64(ptr %a, ptr %b) {
define void @zip_v4i32(ptr %a, ptr %b) {
; CHECK-LABEL: zip_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x1]
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: mov z2.s, z0.s[3]
; CHECK-NEXT: mov z3.s, z1.s[3]
; CHECK-NEXT: mov z4.s, z0.s[2]
+; CHECK-NEXT: mov z5.s, z1.s[2]
; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z1.s[2]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: stp w9, w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w9, w8, [sp]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip_v4i32:
@@ -1209,65 +1135,44 @@ define void @trn_v8i32_undef(ptr %a) {
define void @zip2_v32i8(ptr %a, ptr %b) #0{
; CHECK-LABEL: zip2_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: mov z2.b, z0.b[15]
-; CHECK-NEXT: mov z3.b, z0.b[14]
-; CHECK-NEXT: mov z4.b, z0.b[13]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z3.b, z0.b[11]
-; CHECK-NEXT: mov z2.b, z0.b[12]
-; CHECK-NEXT: strb w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[10]
-; CHECK-NEXT: strb w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[9]
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[8]
-; CHECK-NEXT: strb w9, [sp, #8]
+; CHECK-NEXT: mov z4.b, z0.b[14]
+; CHECK-NEXT: mov z6.b, z0.b[13]
+; CHECK-NEXT: mov z3.b, z1.b[15]
+; CHECK-NEXT: mov z5.b, z1.b[14]
+; CHECK-NEXT: mov z7.b, z1.b[13]
+; CHECK-NEXT: mov z16.b, z0.b[12]
+; CHECK-NEXT: mov z17.b, z1.b[12]
+; CHECK-NEXT: mov z18.b, z0.b[11]
+; CHECK-NEXT: mov z19.b, z1.b[11]
+; CHECK-NEXT: mov z20.b, z0.b[10]
+; CHECK-NEXT: mov z21.b, z1.b[10]
+; CHECK-NEXT: mov z22.b, z0.b[9]
+; CHECK-NEXT: mov z23.b, z1.b[9]
+; CHECK-NEXT: mov z24.b, z0.b[8]
+; CHECK-NEXT: mov z25.b, z1.b[8]
+; CHECK-NEXT: zip1 z2.b, z2.b, z3.b
+; CHECK-NEXT: zip1 z3.b, z4.b, z5.b
+; CHECK-NEXT: zip1 z4.b, z6.b, z7.b
+; CHECK-NEXT: zip1 z5.b, z16.b, z17.b
+; CHECK-NEXT: zip1 z6.b, z18.b, z19.b
+; CHECK-NEXT: zip1 z7.b, z20.b, z21.b
+; CHECK-NEXT: zip1 z16.b, z22.b, z23.b
; CHECK-NEXT: zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT: strb w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[15]
-; CHECK-NEXT: strb w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[14]
-; CHECK-NEXT: strb w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[13]
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[12]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[11]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[10]
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[9]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[8]
-; CHECK-NEXT: strb w9, [sp, #5]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strb w8, [sp, #3]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #1]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z17.b, z24.b, z25.b
+; CHECK-NEXT: zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z3.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z4.h, z7.h, z6.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z5.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip2_v32i8:
@@ -1359,41 +1264,28 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
define void @zip2_v16i16(ptr %a, ptr %b) #0{
; CHECK-LABEL: zip2_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: mov z3.h, z0.h[6]
-; CHECK-NEXT: mov z4.h, z0.h[5]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[4]
-; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov z4.h, z0.h[6]
+; CHECK-NEXT: mov z6.h, z0.h[5]
; CHECK-NEXT: mov z3.h, z1.h[7]
+; CHECK-NEXT: mov z5.h, z1.h[6]
+; CHECK-NEXT: mov z7.h, z1.h[5]
+; CHECK-NEXT: mov z16.h, z0.h[4]
+; CHECK-NEXT: mov z17.h, z1.h[4]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[6]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z1.h[5]
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.h, z1.h[4]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w9, [sp, #10]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT: zip1 z3.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z5.h, z16.h, z17.h
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip2_v16i16:
@@ -1453,8 +1345,6 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
define void @zip2_v8i32(ptr %a, ptr %b) #0{
; CHECK-LABEL: zip2_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1]
@@ -1462,18 +1352,13 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
; CHECK-NEXT: mov z2.s, z0.s[3]
; CHECK-NEXT: mov z4.s, z0.s[2]
; CHECK-NEXT: mov z3.s, z1.s[3]
+; CHECK-NEXT: mov z5.s, z1.s[2]
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.s, z1.s[2]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w9, [sp]
-; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z3.s, z4.s, z5.s
; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: zip2_v8i32:
@@ -1547,197 +1432,139 @@ define void @zip2_v8i32_undef(ptr %a) #0{
define void @uzp_v32i8(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: ldp q2, q3, [x0]
-; CHECK-NEXT: ldp q0, q1, [x1]
-; CHECK-NEXT: mov z4.b, z3.b[14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z6.b, z3.b[10]
-; CHECK-NEXT: mov z5.b, z3.b[12]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z7.b, z3.b[8]
-; CHECK-NEXT: mov z17.b, z3.b[9]
-; CHECK-NEXT: mov z18.b, z3.b[7]
-; CHECK-NEXT: mov z16.b, z3.b[11]
-; CHECK-NEXT: strb w8, [sp, #40]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z3.b[6]
-; CHECK-NEXT: strb w9, [sp, #32]
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: mov z5.b, z3.b[4]
-; CHECK-NEXT: strb w8, [sp, #47]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z3.b[2]
-; CHECK-NEXT: strb w9, [sp, #46]
-; CHECK-NEXT: fmov w9, s7
-; CHECK-NEXT: mov z7.b, z2.b[14]
-; CHECK-NEXT: strb w8, [sp, #45]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z2.b[12]
-; CHECK-NEXT: strb w9, [sp, #44]
-; CHECK-NEXT: fmov w9, s16
-; CHECK-NEXT: mov z16.b, z2.b[11]
-; CHECK-NEXT: strb w8, [sp, #43]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z2.b[10]
-; CHECK-NEXT: strb w9, [sp, #61]
-; CHECK-NEXT: fmov w9, s16
-; CHECK-NEXT: strb w8, [sp, #42]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z2.b[8]
-; CHECK-NEXT: strb w9, [sp, #53]
-; CHECK-NEXT: strb w8, [sp, #41]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z2.b[6]
-; CHECK-NEXT: strb w8, [sp, #39]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z2.b[4]
-; CHECK-NEXT: strb w8, [sp, #38]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z2.b[2]
-; CHECK-NEXT: strb w8, [sp, #37]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[10]
-; CHECK-NEXT: strb w8, [sp, #36]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z1.b[8]
-; CHECK-NEXT: strb w8, [sp, #35]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[14]
-; CHECK-NEXT: strb w8, [sp, #34]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z1.b[12]
-; CHECK-NEXT: strb w8, [sp, #33]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z1.b[6]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z1.b[4]
-; CHECK-NEXT: strb w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[2]
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z0.b[14]
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[12]
-; CHECK-NEXT: strb w8, [sp, #11]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z0.b[10]
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z0.b[8]
-; CHECK-NEXT: strb w8, [sp, #9]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z0.b[6]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.b, z0.b[4]
-; CHECK-NEXT: strb w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.b, z0.b[2]
-; CHECK-NEXT: strb w8, [sp, #5]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z3.b[15]
-; CHECK-NEXT: strb w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z3.b[13]
-; CHECK-NEXT: strb w8, [sp, #3]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: ldr q4, [sp, #32]
-; CHECK-NEXT: strb w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strb w8, [sp, #1]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z3.b[5]
-; CHECK-NEXT: mov z3.b, z3.b[3]
-; CHECK-NEXT: ldr q5, [sp]
-; CHECK-NEXT: strb w8, [sp, #63]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z2.b[13]
-; CHECK-NEXT: strb w8, [sp, #62]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: strb w8, [sp, #60]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: strb w8, [sp, #59]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z2.b[9]
-; CHECK-NEXT: strb w8, [sp, #58]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z2.b[5]
-; CHECK-NEXT: strb w8, [sp, #57]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z2.b[3]
+; CHECK-NEXT: stp d13, d12, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset b8, -8
+; CHECK-NEXT: .cfi_offset b9, -16
+; CHECK-NEXT: .cfi_offset b10, -24
+; CHECK-NEXT: .cfi_offset b11, -32
+; CHECK-NEXT: .cfi_offset b12, -40
+; CHECK-NEXT: .cfi_offset b13, -48
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: mov z2.b, z1.b[14]
+; CHECK-NEXT: mov z3.b, z1.b[12]
+; CHECK-NEXT: mov z4.b, z1.b[10]
+; CHECK-NEXT: mov z5.b, z1.b[8]
+; CHECK-NEXT: mov z6.b, z1.b[6]
+; CHECK-NEXT: mov z7.b, z1.b[4]
+; CHECK-NEXT: mov z16.b, z1.b[2]
+; CHECK-NEXT: mov z18.b, z0.b[14]
+; CHECK-NEXT: mov z19.b, z0.b[12]
+; CHECK-NEXT: zip1 z3.b, z3.b, z2.b
+; CHECK-NEXT: ldp q2, q17, [x1]
+; CHECK-NEXT: mov z20.b, z0.b[10]
+; CHECK-NEXT: zip1 z4.b, z5.b, z4.b
+; CHECK-NEXT: zip1 z5.b, z7.b, z6.b
+; CHECK-NEXT: zip1 z6.b, z1.b, z16.b
+; CHECK-NEXT: mov z7.b, z0.b[8]
+; CHECK-NEXT: mov z16.b, z0.b[6]
+; CHECK-NEXT: mov z21.b, z0.b[4]
+; CHECK-NEXT: mov z22.b, z0.b[2]
+; CHECK-NEXT: mov z23.b, z17.b[14]
+; CHECK-NEXT: mov z24.b, z17.b[12]
+; CHECK-NEXT: mov z25.b, z17.b[10]
+; CHECK-NEXT: mov z26.b, z17.b[8]
+; CHECK-NEXT: mov z27.b, z17.b[6]
+; CHECK-NEXT: mov z28.b, z17.b[4]
+; CHECK-NEXT: mov z29.b, z17.b[2]
+; CHECK-NEXT: zip1 z18.b, z19.b, z18.b
+; CHECK-NEXT: zip1 z7.b, z7.b, z20.b
+; CHECK-NEXT: zip1 z16.b, z21.b, z16.b
+; CHECK-NEXT: zip1 z19.b, z0.b, z22.b
+; CHECK-NEXT: zip1 z20.b, z24.b, z23.b
+; CHECK-NEXT: zip1 z21.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z22.b, z28.b, z27.b
+; CHECK-NEXT: mov z24.b, z2.b[14]
+; CHECK-NEXT: mov z25.b, z2.b[12]
+; CHECK-NEXT: mov z26.b, z2.b[10]
+; CHECK-NEXT: mov z27.b, z2.b[8]
+; CHECK-NEXT: zip1 z23.b, z17.b, z29.b
+; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z5.h, z7.h, z18.h
+; CHECK-NEXT: zip1 z6.h, z19.h, z16.h
+; CHECK-NEXT: zip1 z7.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z18.b, z25.b, z24.b
+; CHECK-NEXT: zip1 z19.b, z27.b, z26.b
+; CHECK-NEXT: mov z20.b, z2.b[6]
+; CHECK-NEXT: mov z21.b, z2.b[4]
+; CHECK-NEXT: mov z29.b, z17.b[3]
+; CHECK-NEXT: mov z30.b, z17.b[1]
+; CHECK-NEXT: mov z31.b, z2.b[15]
+; CHECK-NEXT: mov z8.b, z2.b[13]
+; CHECK-NEXT: zip1 z16.h, z23.h, z22.h
+; CHECK-NEXT: mov z22.b, z2.b[2]
+; CHECK-NEXT: mov z23.b, z17.b[15]
+; CHECK-NEXT: mov z24.b, z17.b[13]
+; CHECK-NEXT: mov z25.b, z17.b[11]
+; CHECK-NEXT: mov z26.b, z17.b[9]
+; CHECK-NEXT: mov z27.b, z17.b[7]
+; CHECK-NEXT: mov z28.b, z17.b[5]
+; CHECK-NEXT: zip1 z17.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z21.b, z21.b, z20.b
+; CHECK-NEXT: zip1 z19.b, z30.b, z29.b
+; CHECK-NEXT: zip1 z20.b, z8.b, z31.b
+; CHECK-NEXT: mov z29.b, z1.b[15]
+; CHECK-NEXT: mov z30.b, z1.b[13]
+; CHECK-NEXT: mov z31.b, z1.b[11]
+; CHECK-NEXT: mov z8.b, z1.b[9]
+; CHECK-NEXT: zip1 z22.b, z2.b, z22.b
+; CHECK-NEXT: zip1 z23.b, z24.b, z23.b
+; CHECK-NEXT: zip1 z24.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z18.b, z28.b, z27.b
+; CHECK-NEXT: mov z25.b, z2.b[11]
+; CHECK-NEXT: mov z26.b, z2.b[9]
+; CHECK-NEXT: mov z27.b, z2.b[7]
+; CHECK-NEXT: mov z28.b, z2.b[5]
+; CHECK-NEXT: mov z9.b, z1.b[7]
+; CHECK-NEXT: mov z10.b, z1.b[5]
+; CHECK-NEXT: mov z1.b, z1.b[3]
+; CHECK-NEXT: mov z11.b, z0.b[11]
+; CHECK-NEXT: mov z12.b, z0.b[9]
+; CHECK-NEXT: zip1 z29.b, z30.b, z29.b
+; CHECK-NEXT: mov z30.b, z0.b[3]
+; CHECK-NEXT: mov z13.b, z0.b[1]
+; CHECK-NEXT: zip1 z31.b, z8.b, z31.b
+; CHECK-NEXT: mov z8.b, z2.b[3]
; CHECK-NEXT: mov z2.b, z2.b[1]
-; CHECK-NEXT: strb w8, [sp, #54]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[15]
-; CHECK-NEXT: strb w8, [sp, #52]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[13]
-; CHECK-NEXT: strb w8, [sp, #50]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z1.b[11]
-; CHECK-NEXT: strb w8, [sp, #49]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z1.b[9]
-; CHECK-NEXT: strb w8, [sp, #48]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z1.b[7]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[15]
-; CHECK-NEXT: strb w8, [sp, #31]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z1.b[5]
-; CHECK-NEXT: strb w9, [sp, #28]
-; CHECK-NEXT: strb w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.b, z1.b[3]
-; CHECK-NEXT: mov z1.b, z1.b[1]
-; CHECK-NEXT: strb w8, [sp, #29]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z0.b[11]
-; CHECK-NEXT: strb w8, [sp, #27]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[13]
-; CHECK-NEXT: strb w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: strb w8, [sp, #25]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.b, z0.b[9]
-; CHECK-NEXT: strb w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.b, z0.b[7]
-; CHECK-NEXT: strb w8, [sp, #23]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.b, z0.b[5]
-; CHECK-NEXT: strb w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.b, z0.b[3]
-; CHECK-NEXT: mov z0.b, z0.b[1]
-; CHECK-NEXT: strb w8, [sp, #21]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #19]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strb w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: strb w8, [sp, #17]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: ldr q0, [sp, #48]
-; CHECK-NEXT: add z0.b, z4.b, z0.b
-; CHECK-NEXT: strb w8, [sp, #16]
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: add z1.b, z5.b, z1.b
+; CHECK-NEXT: zip1 z9.b, z10.b, z9.b
+; CHECK-NEXT: zip1 z10.b, z12.b, z11.b
+; CHECK-NEXT: zip1 z1.b, z0.b, z1.b
+; CHECK-NEXT: zip1 z30.b, z13.b, z30.b
+; CHECK-NEXT: mov z11.b, z0.b[13]
+; CHECK-NEXT: mov z0.b, z0.b[5]
+; CHECK-NEXT: zip1 z25.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z26.b, z28.b, z27.b
+; CHECK-NEXT: zip1 z2.b, z2.b, z8.b
+; CHECK-NEXT: zip1 z21.h, z22.h, z21.h
+; CHECK-NEXT: zip1 z22.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z23.h, z31.h, z29.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z9.h
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z24.h, z10.h, z11.h
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 z0.h, z30.h, z0.h
+; CHECK-NEXT: zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z19.h, z25.h, z20.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z26.h
+; CHECK-NEXT: zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT: zip1 z4.s, z6.s, z5.s
+; CHECK-NEXT: zip1 z5.s, z16.s, z7.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z23.s
+; CHECK-NEXT: zip1 z6.s, z21.s, z17.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z24.s
+; CHECK-NEXT: zip1 z7.s, z18.s, z22.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z19.s
+; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: zip1 z1.d, z6.d, z5.d
+; CHECK-NEXT: zip1 z2.d, z2.d, z7.d
+; CHECK-NEXT: add z0.b, z3.b, z0.b
+; CHECK-NEXT: add z1.b, z1.b, z2.b
; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ldp d13, d12, [sp], #48 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v32i8:
@@ -1922,110 +1749,71 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
define void @uzp_v16i16(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: ldp q2, q3, [x0]
-; CHECK-NEXT: ldp q0, q1, [x1]
-; CHECK-NEXT: mov z4.h, z3.h[6]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z6.h, z3.h[2]
-; CHECK-NEXT: mov z5.h, z3.h[4]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z7.h, z2.h[6]
-; CHECK-NEXT: mov z17.h, z2.h[7]
-; CHECK-NEXT: mov z16.h, z3.h[1]
-; CHECK-NEXT: strh w8, [sp, #40]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z2.h[4]
-; CHECK-NEXT: strh w9, [sp, #32]
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: mov z5.h, z2.h[2]
-; CHECK-NEXT: strh w8, [sp, #46]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z1.h[2]
-; CHECK-NEXT: strh w9, [sp, #44]
-; CHECK-NEXT: fmov w9, s7
-; CHECK-NEXT: mov z7.h, z0.h[6]
-; CHECK-NEXT: strh w8, [sp, #42]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[6]
-; CHECK-NEXT: strh w9, [sp, #38]
-; CHECK-NEXT: fmov w9, s16
-; CHECK-NEXT: strh w8, [sp, #36]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z1.h[4]
-; CHECK-NEXT: strh w9, [sp, #56]
-; CHECK-NEXT: strh w8, [sp, #34]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z0.h[4]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z0.h[2]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z3.h[7]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.h, z3.h[5]
-; CHECK-NEXT: strh w8, [sp, #6]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z3.h[3]
-; CHECK-NEXT: ldr q3, [sp, #32]
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z2.h[5]
-; CHECK-NEXT: ldr q4, [sp]
-; CHECK-NEXT: strh w8, [sp, #62]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: mov z7.h, z1.h[7]
-; CHECK-NEXT: strh w8, [sp, #60]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z2.h[3]
-; CHECK-NEXT: mov z2.h, z2.h[1]
-; CHECK-NEXT: strh w8, [sp, #58]
-; CHECK-NEXT: fmov w8, s17
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: strh w8, [sp, #54]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z1.h[5]
-; CHECK-NEXT: strh w9, [sp, #48]
-; CHECK-NEXT: strh w8, [sp, #52]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z1.h[3]
+; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset b8, -16
+; CHECK-NEXT: ldp q1, q6, [x0]
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: mov z3.h, z6.h[6]
+; CHECK-NEXT: mov z4.h, z6.h[4]
+; CHECK-NEXT: mov z5.h, z6.h[2]
+; CHECK-NEXT: mov z7.h, z1.h[6]
+; CHECK-NEXT: mov z16.h, z1.h[4]
+; CHECK-NEXT: mov z17.h, z1.h[2]
+; CHECK-NEXT: mov z18.h, z2.h[6]
+; CHECK-NEXT: mov z19.h, z2.h[4]
+; CHECK-NEXT: mov z20.h, z2.h[2]
+; CHECK-NEXT: mov z21.h, z0.h[6]
+; CHECK-NEXT: mov z22.h, z0.h[4]
+; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT: zip1 z4.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z5.h, z16.h, z7.h
+; CHECK-NEXT: zip1 z7.h, z1.h, z17.h
+; CHECK-NEXT: zip1 z16.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z18.h, z2.h, z20.h
+; CHECK-NEXT: mov z19.h, z0.h[2]
+; CHECK-NEXT: zip1 z17.h, z22.h, z21.h
+; CHECK-NEXT: mov z20.h, z6.h[7]
+; CHECK-NEXT: mov z21.h, z6.h[5]
+; CHECK-NEXT: mov z22.h, z6.h[3]
+; CHECK-NEXT: mov z6.h, z6.h[1]
+; CHECK-NEXT: mov z23.h, z1.h[7]
+; CHECK-NEXT: mov z24.h, z1.h[5]
+; CHECK-NEXT: mov z25.h, z1.h[3]
; CHECK-NEXT: mov z1.h, z1.h[1]
-; CHECK-NEXT: strh w8, [sp, #50]
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: mov z6.h, z0.h[5]
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[1]
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: ldr q0, [sp, #48]
-; CHECK-NEXT: add z0.h, z3.h, z0.h
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: add z1.h, z4.h, z1.h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov z26.h, z2.h[7]
+; CHECK-NEXT: mov z27.h, z2.h[5]
+; CHECK-NEXT: mov z28.h, z2.h[3]
+; CHECK-NEXT: mov z2.h, z2.h[1]
+; CHECK-NEXT: mov z29.h, z0.h[7]
+; CHECK-NEXT: mov z30.h, z0.h[5]
+; CHECK-NEXT: mov z31.h, z0.h[3]
+; CHECK-NEXT: mov z8.h, z0.h[1]
+; CHECK-NEXT: zip1 z0.h, z0.h, z19.h
+; CHECK-NEXT: zip1 z19.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z6.h, z6.h, z22.h
+; CHECK-NEXT: zip1 z20.h, z24.h, z23.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z25.h
+; CHECK-NEXT: zip1 z21.h, z27.h, z26.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT: zip1 z22.h, z30.h, z29.h
+; CHECK-NEXT: zip1 z23.h, z8.h, z31.h
+; CHECK-NEXT: zip1 z3.s, z4.s, z3.s
+; CHECK-NEXT: zip1 z4.s, z7.s, z5.s
+; CHECK-NEXT: zip1 z5.s, z18.s, z16.s
+; CHECK-NEXT: zip1 z6.s, z6.s, z19.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z20.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z17.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z21.s
+; CHECK-NEXT: zip1 z7.s, z23.s, z22.s
+; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z5.d
+; CHECK-NEXT: zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT: add z1.h, z3.h, z1.h
+; CHECK-NEXT: add z0.h, z0.h, z2.h
+; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v16i16:
@@ -2116,32 +1904,28 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
define void @uzp_v8f32(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: ldp q2, q0, [x0]
+; CHECK-NEXT: ldp q6, q0, [x0]
; CHECK-NEXT: adrp x8, .LCPI21_0
-; CHECK-NEXT: ldp q4, q1, [x1]
+; CHECK-NEXT: ldp q1, q2, [x1]
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: mov z3.s, z0.s[2]
-; CHECK-NEXT: mov z5.s, z1.s[2]
-; CHECK-NEXT: stp s0, s3, [sp, #24]
-; CHECK-NEXT: mov z3.s, z4.s[2]
-; CHECK-NEXT: stp s5, s2, [sp, #12]
-; CHECK-NEXT: mov z5.s, z0.s[3]
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: stp s3, s1, [sp, #4]
-; CHECK-NEXT: mov z1.s, z2.s[1]
-; CHECK-NEXT: str s5, [sp, #44]
+; CHECK-NEXT: mov z4.s, z0.s[3]
+; CHECK-NEXT: mov z5.s, z0.s[1]
+; CHECK-NEXT: mov z7.s, z2.s[2]
+; CHECK-NEXT: mov z16.s, z1.s[2]
+; CHECK-NEXT: zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
+; CHECK-NEXT: mov z4.s, z6.s[1]
+; CHECK-NEXT: zip1 z2.s, z2.s, z7.s
; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT: str s0, [sp, #40]
-; CHECK-NEXT: ldp q3, q2, [sp]
-; CHECK-NEXT: tbl z0.s, { z4.s }, z5.s
-; CHECK-NEXT: str s1, [sp, #32]
-; CHECK-NEXT: ldr q1, [sp, #32]
-; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: zip1 z7.s, z0.s, z16.s
+; CHECK-NEXT: tbl z1.s, { z1.s }, z5.s
+; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
+; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT: zip1 z2.d, z7.d, z2.d
; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT: stp q1, q0, [x0]
-; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v8f32:
@@ -2231,60 +2015,38 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
define void @uzp_v8i16(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: mov z2.h, z1.h[6]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z4.h, z1.h[2]
-; CHECK-NEXT: mov z6.h, z0.h[4]
-; CHECK-NEXT: mov z3.h, z1.h[4]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov z5.h, z0.h[6]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[2]
-; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z3.h, z1.h[7]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z1.h[5]
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: mov z5.h, z1.h[3]
-; CHECK-NEXT: mov z1.h, z1.h[1]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: strh w9, [sp, #6]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #4]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov z2.h, z0.h[7]
-; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov z4.h, z0.h[5]
-; CHECK-NEXT: strh w8, [sp, #28]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov z5.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[1]
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: strh w8, [sp, #18]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: ldp q3, q0, [sp]
-; CHECK-NEXT: add z0.h, z3.h, z0.h
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: mov z2.h, z0.h[6]
+; CHECK-NEXT: mov z3.h, z0.h[4]
+; CHECK-NEXT: mov z4.h, z0.h[2]
+; CHECK-NEXT: mov z5.h, z1.h[6]
+; CHECK-NEXT: mov z6.h, z1.h[4]
+; CHECK-NEXT: mov z7.h, z1.h[2]
+; CHECK-NEXT: mov z16.h, z0.h[7]
+; CHECK-NEXT: mov z17.h, z0.h[5]
+; CHECK-NEXT: mov z18.h, z0.h[3]
+; CHECK-NEXT: mov z19.h, z0.h[1]
+; CHECK-NEXT: mov z20.h, z1.h[7]
+; CHECK-NEXT: mov z21.h, z1.h[5]
+; CHECK-NEXT: mov z22.h, z1.h[3]
+; CHECK-NEXT: mov z23.h, z1.h[1]
+; CHECK-NEXT: zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z4.h
+; CHECK-NEXT: zip1 z3.h, z6.h, z5.h
+; CHECK-NEXT: zip1 z1.h, z1.h, z7.h
+; CHECK-NEXT: zip1 z4.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z5.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z6.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z7.h, z23.h, z22.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z3.s
+; CHECK-NEXT: zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
+; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v8i16:
@@ -2341,31 +2103,21 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
define void @uzp_v8i32_undef(ptr %a) #0{
; CHECK-LABEL: uzp_v8i32_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: mov z2.s, z0.s[2]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z3.s, z1.s[2]
-; CHECK-NEXT: mov z4.s, z0.s[3]
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.s, z1.s[3]
-; CHECK-NEXT: stp w8, w9, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov z1.s, z1.s[1]
-; CHECK-NEXT: stp w8, w9, [sp]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: stp w9, w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: stp w9, w8, [sp, #16]
-; CHECK-NEXT: ldp q0, q1, [sp]
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: mov z2.s, z1.s[2]
+; CHECK-NEXT: mov z3.s, z0.s[2]
+; CHECK-NEXT: mov z4.s, z1.s[3]
+; CHECK-NEXT: mov z5.s, z1.s[1]
+; CHECK-NEXT: mov z6.s, z0.s[3]
+; CHECK-NEXT: mov z7.s, z0.s[1]
+; CHECK-NEXT: zip1 z1.s, z1.s, z2.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z3.s
+; CHECK-NEXT: zip1 z2.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z3.s, z7.s, z6.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: zip1 z1.d, z3.d, z2.d
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: stp q0, q0, [x0]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
index 88c83a214c7394..c942f1eca8ebaf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
@@ -10,22 +10,14 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
; CHECK-NEXT: mov z1.s, z0.s[3]
-; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: mov z2.s, z0.s[2]
; CHECK-NEXT: mov z3.s, z0.s[1]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: zip1 z1.h, z2.h, z1.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%el0 = extractelement <vscale x 4 x i1> %a, i32 0
%el1 = extractelement <vscale x 4 x i1> %a, i32 1
More information about the llvm-commits
mailing list