[llvm] 93831c7 - [AArch64] Let patterns for NEON instructions check runtime mode. (#95560)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 19 06:24:32 PDT 2024
Author: Sander de Smalen
Date: 2024-06-19T14:24:28+01:00
New Revision: 93831c73ea51dcf4dc1832a4ea5616b819d36f31
URL: https://github.com/llvm/llvm-project/commit/93831c73ea51dcf4dc1832a4ea5616b819d36f31
DIFF: https://github.com/llvm/llvm-project/commit/93831c73ea51dcf4dc1832a4ea5616b819d36f31.diff
LOG: [AArch64] Let patterns for NEON instructions check runtime mode. (#95560)
This helps identify any failures where the compiler might otherwise
silently emit instructions that are not valid for the given runtime mode.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
Removed:
llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
llvm/test/MC/AArch64/SME/streaming-mode-neon.s
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f6f66e9e0c70..0f0606c49a570 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22368,7 +22368,8 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
SmallVector<SDValue, 16> MaskConstants;
- if (VecVT == MVT::v16i8) {
+ if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
+ VecVT == MVT::v16i8) {
// v16i8 is a special case, as we have 16 entries but only 8 positional bits
// per entry. We split it into two halves, apply the mask, zip the halves to
// create 8x 16-bit values, and the perform the vector reduce.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 17d011086634c..e1ecc5a57dd26 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7602,13 +7602,12 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
}
let mayRaiseFPException = 1, Uses = [FPCR] in
-multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
- Predicate pred = HasNEON> {
- let Predicates = [pred] in {
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
}
- let Predicates = [pred, HasFullFP16] in {
+ let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
}
}
@@ -7616,11 +7615,13 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
[(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ }
+ let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
}
@@ -7880,7 +7881,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst,
multiclass SMov {
// SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorSME] in {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
@@ -7927,7 +7928,7 @@ multiclass SMov {
multiclass UMov {
// UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorSME] in {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 91e5bc3caa102..6afee9bd388a6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -107,7 +107,7 @@ def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPC_IMMO()">,
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
-def HasNEON : Predicate<"Subtarget->hasNEON()">,
+def HasNEON : Predicate<"Subtarget->isNeonAvailable()">,
AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;
def HasSM4 : Predicate<"Subtarget->hasSM4()">,
AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">;
@@ -235,11 +235,10 @@ def HasSMEF16F16orSMEF8F16
"sme-f16f16 or sme-f8f16">;
// A subset of NEON instructions are legal in Streaming SVE execution mode,
-// they should be enabled if either has been specified.
-def HasNEONorSME
- : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">,
- AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME),
- "neon or sme">;
+// so don't need the additional check for 'isNeonAvailable'.
+def HasNEONandIsStreamingSafe
+ : Predicate<"Subtarget->hasNEON()">,
+ AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -323,8 +322,6 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
-def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">;
-
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
@@ -1350,7 +1347,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
VectorIndexS:$idx)>;
}
-let Predicates = [HasNEONorSME, HasBF16] in {
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
// Round FP32 to BF16.
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
@@ -5789,9 +5786,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>;
-defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>;
-defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -5820,7 +5817,7 @@ let Predicates = [HasRDM] in {
defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
int_aarch64_neon_fmulx,
- [HasNEONorSME]>;
+ [HasNEONandIsStreamingSafe]>;
let Predicates = [HasNEON] in {
def : InstAlias<"cmls $dst, $src1, $src2",
@@ -5894,9 +5891,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorSME>;
-defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorSME>;
-defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorSME>;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5915,7 +5912,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
(CMLTv1i64rz V64:$Rn)>;
// Round FP64 to BF16.
-let Predicates = [HasNEONorSME, HasBF16] in
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in
def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
(BFCVT (FCVTXNv1i64 $Rn))>;
@@ -6016,7 +6013,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEON] in {
+let Predicates = [HasNEONandIsStreamingSafe] in {
def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
(SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -6026,7 +6023,7 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
(UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
-let Predicates = [HasFullFP16] in {
+let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
(SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
@@ -6118,7 +6115,7 @@ def : Pat <(f64 (uint_to_fp (i32
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
-} // let Predicates = [HasNEON]
+} // let Predicates = [HasNEONandIsStreamingSafe]
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
@@ -8379,7 +8376,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
// Same as above, but the first element is populated using
// scalar_to_vector + insert_subvector instead of insert_vector_elt.
-let Predicates = [IsNeonAvailable] in {
+let Predicates = [HasNEON] in {
class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
SDPatternOperator ExtLoad, Instruction LD1>
: Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bd5de628d8529..a3c41f2e052cd 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3351,7 +3351,7 @@ let Predicates = [HasSVEorSME] in {
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
// Extract element from vector with immediate index that's within the bottom 128-bits.
- let Predicates = [IsNeonAvailable], AddedComplexity = 1 in {
+ let Predicates = [HasNEON], AddedComplexity = 1 in {
def : Pat<(i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)),
(UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
def : Pat<(i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)),
@@ -3360,9 +3360,9 @@ let Predicates = [HasSVEorSME] in {
(UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)),
(UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
- } // End IsNeonAvailable
+ } // End HasNEON
- let Predicates = [IsNeonAvailable] in {
+ let Predicates = [HasNEON] in {
def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8),
(SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index))), i8),
@@ -3375,7 +3375,7 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
(SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
- } // End IsNeonAvailable
+ } // End HasNEON
// Extract first element from vector.
let AddedComplexity = 2 in {
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index a689a539b0082..5f4b9dd1592cf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -315,92 +315,40 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-LABEL: masked_load_v16i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: sub sp, sp, #1024
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT: sub sp, sp, #992
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1008
; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
; NONEON-NOSVE-NEXT: str q0, [sp, #976]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #995]
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #979]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #977]
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #980]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #981]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #982]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w11, [sp, #976]
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1012]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1014]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1016]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1018]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1020]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w12, w13
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w10, w14
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1022]
+; NONEON-NOSVE-NEXT: and w9, w9, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #983]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: and w12, w12, #0x10
+; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w13, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w12
+; NONEON-NOSVE-NEXT: and w12, w14, #0x40
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w10, w8
+; NONEON-NOSVE-NEXT: orr w10, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: and w9, w9, #0xffffff80
; NONEON-NOSVE-NEXT: add w8, w8, w9
; NONEON-NOSVE-NEXT: add x9, sp, #720
-; NONEON-NOSVE-NEXT: add w8, w8, w10
; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB2_2
; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load
; NONEON-NOSVE-NEXT: ldrb w10, [x0]
@@ -481,7 +429,7 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: strb w8, [sp, #47]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: .LBB2_19: // %else44
-; NONEON-NOSVE-NEXT: add sp, sp, #1024
+; NONEON-NOSVE-NEXT: add sp, sp, #992
; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load4
@@ -806,166 +754,62 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
; NONEON-NOSVE-LABEL: masked_load_v32i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: sub sp, sp, #2064
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 2080
+; NONEON-NOSVE-NEXT: sub sp, sp, #2000
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 2016
; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #2216]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2152]
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2272]
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #2176]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #2160]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2024]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2264]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2016]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2256]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2031]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2248]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2030]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x20
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2240]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2029]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2232]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2028]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x8
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2224]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2027]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2208]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2026]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2200]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2025]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2192]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2023]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #2184]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #2022]
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #2096]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #2104]
+; NONEON-NOSVE-NEXT: sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT: ldr w10, [sp, #2112]
+; NONEON-NOSVE-NEXT: ldr w11, [sp, #2088]
+; NONEON-NOSVE-NEXT: ldr w12, [sp, #2120]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2168]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x20
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #2021]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2020]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x8
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2019]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2088]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2018]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2136]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2017]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2144]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2008]
-; NONEON-NOSVE-NEXT: and w8, w1, #0x1
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #2104]
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2000]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #2080]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #2016]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2128]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2015]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2120]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2014]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #2112]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x20
+; NONEON-NOSVE-NEXT: ldr w13, [sp, #2136]
; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #2013]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2096]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x10
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #2012]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2011]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x4
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2010]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2009]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x80
-; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2007]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2006]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x20
-; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2005]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2004]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x8
+; NONEON-NOSVE-NEXT: ldr w14, [sp, #2144]
+; NONEON-NOSVE-NEXT: ldr w16, [sp, #2016]
+; NONEON-NOSVE-NEXT: and w8, w8, #0x2
+; NONEON-NOSVE-NEXT: and w9, w9, #0x4
+; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT: ldr w12, [sp, #2128]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w11, #0x10
+; NONEON-NOSVE-NEXT: sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: and w9, w11, #0x20
+; NONEON-NOSVE-NEXT: sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT: and w10, w10, #0x40
+; NONEON-NOSVE-NEXT: sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w12, #0x8
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2003]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2002]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2001]
-; NONEON-NOSVE-NEXT: str q0, [sp, #2048]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #2000]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2050]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2048]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2052]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2054]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2056]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2058]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2060]
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w12, w13
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: add w9, w9, w10
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: str q0, [sp, #2032]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2034]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2032]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2036]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #2038]
-; NONEON-NOSVE-NEXT: ldrh w15, [sp, #2040]
-; NONEON-NOSVE-NEXT: ldrh w16, [sp, #2042]
-; NONEON-NOSVE-NEXT: add w10, w12, w11
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2044]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2062]
-; NONEON-NOSVE-NEXT: add w13, w13, w14
-; NONEON-NOSVE-NEXT: add w14, w15, w16
-; NONEON-NOSVE-NEXT: add w10, w10, w13
-; NONEON-NOSVE-NEXT: add w11, w14, w11
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2046]
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w8, w12
-; NONEON-NOSVE-NEXT: add w8, w9, w13
+; NONEON-NOSVE-NEXT: and w11, w11, #0x4
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: and w12, w13, #0x10
+; NONEON-NOSVE-NEXT: sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w13, #0x20
+; NONEON-NOSVE-NEXT: and w13, w15, #0x40
+; NONEON-NOSVE-NEXT: sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w10, w11
+; NONEON-NOSVE-NEXT: orr w10, w12, w13
+; NONEON-NOSVE-NEXT: and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT: add w11, w8, w11
+; NONEON-NOSVE-NEXT: add w8, w9, w10
; NONEON-NOSVE-NEXT: adrp x9, .LCPI3_0
-; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT: bfi w8, w11, #16, #16
; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI3_0]
; NONEON-NOSVE-NEXT: add x9, sp, #1744
; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB3_2
@@ -1083,7 +927,7 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
; NONEON-NOSVE-NEXT: strb w8, [sp, #47]
; NONEON-NOSVE-NEXT: ldr q1, [sp, #32]
; NONEON-NOSVE-NEXT: .LBB3_35: // %else92
-; NONEON-NOSVE-NEXT: add sp, sp, #2064
+; NONEON-NOSVE-NEXT: add sp, sp, #2000
; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load4
@@ -1996,94 +1840,42 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-LABEL: masked_load_v16f16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: sub sp, sp, #1024
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT: sub sp, sp, #992
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1008
; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
; NONEON-NOSVE-NEXT: str q0, [sp, #976]
; NONEON-NOSVE-NEXT: adrp x9, .LCPI7_0
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #995]
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #979]
+; NONEON-NOSVE-NEXT: ldrb w11, [sp, #977]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #980]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #981]
+; NONEON-NOSVE-NEXT: ldrb w15, [sp, #982]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #976]
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT: and w10, w10, #0x8
+; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #983]
+; NONEON-NOSVE-NEXT: and w11, w11, #0x2
+; NONEON-NOSVE-NEXT: and w13, w13, #0x10
+; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w14, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w13
+; NONEON-NOSVE-NEXT: and w13, w15, #0x40
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w11, w8
+; NONEON-NOSVE-NEXT: orr w11, w12, w13
; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT: orr w8, w8, w11
+; NONEON-NOSVE-NEXT: and w10, w10, #0xffffff80
; NONEON-NOSVE-NEXT: add x9, sp, #720
-; NONEON-NOSVE-NEXT: str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1012]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1014]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1016]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1018]
-; NONEON-NOSVE-NEXT: ldrh w15, [sp, #1020]
-; NONEON-NOSVE-NEXT: add w8, w10, w8
-; NONEON-NOSVE-NEXT: add w10, w11, w12
-; NONEON-NOSVE-NEXT: add w11, w13, w14
-; NONEON-NOSVE-NEXT: add w8, w8, w10
-; NONEON-NOSVE-NEXT: add w10, w11, w15
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1022]
; NONEON-NOSVE-NEXT: add w8, w8, w10
-; NONEON-NOSVE-NEXT: add w8, w8, w11
; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB7_2
; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load
; NONEON-NOSVE-NEXT: fmov s0, wzr
@@ -2159,7 +1951,7 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: str h1, [sp, #46]
; NONEON-NOSVE-NEXT: ldr q1, [sp, #32]
; NONEON-NOSVE-NEXT: .LBB7_19: // %else44
-; NONEON-NOSVE-NEXT: add sp, sp, #1024
+; NONEON-NOSVE-NEXT: add sp, sp, #992
; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load4
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 13b83d2ae3f07..0c3411e5f5514 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -172,89 +172,37 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
;
; NONEON-NOSVE-LABEL: masked_store_v16i8:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT: str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w11, [sp]
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: str q0, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w12, w13
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w10, w14
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT: and w9, w9, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: and w12, w12, #0x10
+; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w13, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w12
+; NONEON-NOSVE-NEXT: and w12, w14, #0x40
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w10, w8
+; NONEON-NOSVE-NEXT: orr w10, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: and w9, w9, #0xffffff80
; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w8, w8, w10
; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17
; NONEON-NOSVE-NEXT: // %bb.1: // %else
; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18
@@ -287,7 +235,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: .LBB2_15: // %else28
; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32
; NONEON-NOSVE-NEXT: .LBB2_16: // %else30
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.store
; NONEON-NOSVE-NEXT: strb wzr, [x0]
@@ -336,7 +284,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16
; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.store29
; NONEON-NOSVE-NEXT: strb wzr, [x0, #15]
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
ret void
@@ -421,328 +369,219 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
;
; NONEON-NOSVE-LABEL: masked_store_v32i8:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: sub sp, sp, #80
-; NONEON-NOSVE-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #216]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #152]
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #272]
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #176]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #160]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #264]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #256]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #248]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x20
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #240]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #232]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x8
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #224]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #208]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #200]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #192]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #22]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #168]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x20
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #21]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x8
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #19]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #80]
; NONEON-NOSVE-NEXT: ldr w9, [sp, #88]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #136]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #144]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #8]
-; NONEON-NOSVE-NEXT: and w8, w1, #0x1
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #104]
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #80]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #128]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #15]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #120]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #14]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x20
-; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT: sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT: ldr w10, [sp, #96]
+; NONEON-NOSVE-NEXT: ldr w12, [sp, #104]
+; NONEON-NOSVE-NEXT: ldr w11, [sp, #72]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #96]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x10
-; NONEON-NOSVE-NEXT: zip1 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #12]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #11]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x4
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #10]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #9]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x80
-; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #7]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #6]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x20
-; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #5]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #4]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x8
+; NONEON-NOSVE-NEXT: ldr w13, [sp, #120]
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: ldr w14, [sp, #128]
+; NONEON-NOSVE-NEXT: and w8, w8, #0x2
+; NONEON-NOSVE-NEXT: and w9, w9, #0x4
+; NONEON-NOSVE-NEXT: ldr w16, [sp]
+; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT: ldr w11, [sp, #112]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x8
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w12, #0x10
+; NONEON-NOSVE-NEXT: sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w12, #0x8
+; NONEON-NOSVE-NEXT: and w9, w11, #0x20
+; NONEON-NOSVE-NEXT: and w10, w10, #0x40
+; NONEON-NOSVE-NEXT: sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #3]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1]
-; NONEON-NOSVE-NEXT: ldr q0, [sp]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #52]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #54]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #56]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #58]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #38]
-; NONEON-NOSVE-NEXT: ldrh w15, [sp, #40]
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #60]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #34]
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w12, w13
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #36]
-; NONEON-NOSVE-NEXT: ldrh w16, [sp, #42]
-; NONEON-NOSVE-NEXT: add w9, w9, w10
-; NONEON-NOSVE-NEXT: add w10, w12, w11
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #44]
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w12, w13, w14
-; NONEON-NOSVE-NEXT: add w14, w15, w16
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #62]
-; NONEON-NOSVE-NEXT: add w10, w10, w12
-; NONEON-NOSVE-NEXT: add w11, w14, w11
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #46]
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w8, w13
-; NONEON-NOSVE-NEXT: add w8, w9, w12
-; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16
-; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_34
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w11, #0x4
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: and w12, w13, #0x10
+; NONEON-NOSVE-NEXT: sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT: bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: and w12, w13, #0x20
+; NONEON-NOSVE-NEXT: and w13, w15, #0x40
+; NONEON-NOSVE-NEXT: sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w10, w11
+; NONEON-NOSVE-NEXT: orr w10, w12, w13
+; NONEON-NOSVE-NEXT: and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT: add w11, w8, w11
+; NONEON-NOSVE-NEXT: add w8, w9, w10
+; NONEON-NOSVE-NEXT: bfi w8, w11, #16, #16
+; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33
; NONEON-NOSVE-NEXT: // %bb.1: // %else
-; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_35
+; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34
; NONEON-NOSVE-NEXT: .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35
; NONEON-NOSVE-NEXT: .LBB3_3: // %else4
-; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36
; NONEON-NOSVE-NEXT: .LBB3_4: // %else6
-; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37
; NONEON-NOSVE-NEXT: .LBB3_5: // %else8
-; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38
; NONEON-NOSVE-NEXT: .LBB3_6: // %else10
-; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39
; NONEON-NOSVE-NEXT: .LBB3_7: // %else12
-; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40
; NONEON-NOSVE-NEXT: .LBB3_8: // %else14
-; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41
; NONEON-NOSVE-NEXT: .LBB3_9: // %else16
-; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42
; NONEON-NOSVE-NEXT: .LBB3_10: // %else18
-; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43
; NONEON-NOSVE-NEXT: .LBB3_11: // %else20
-; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44
; NONEON-NOSVE-NEXT: .LBB3_12: // %else22
-; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45
; NONEON-NOSVE-NEXT: .LBB3_13: // %else24
-; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46
; NONEON-NOSVE-NEXT: .LBB3_14: // %else26
-; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47
; NONEON-NOSVE-NEXT: .LBB3_15: // %else28
-; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48
; NONEON-NOSVE-NEXT: .LBB3_16: // %else30
-; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49
; NONEON-NOSVE-NEXT: .LBB3_17: // %else32
-; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50
; NONEON-NOSVE-NEXT: .LBB3_18: // %else34
-; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51
; NONEON-NOSVE-NEXT: .LBB3_19: // %else36
-; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52
; NONEON-NOSVE-NEXT: .LBB3_20: // %else38
-; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53
; NONEON-NOSVE-NEXT: .LBB3_21: // %else40
-; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54
; NONEON-NOSVE-NEXT: .LBB3_22: // %else42
-; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55
; NONEON-NOSVE-NEXT: .LBB3_23: // %else44
-; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56
; NONEON-NOSVE-NEXT: .LBB3_24: // %else46
-; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57
; NONEON-NOSVE-NEXT: .LBB3_25: // %else48
-; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58
; NONEON-NOSVE-NEXT: .LBB3_26: // %else50
-; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59
; NONEON-NOSVE-NEXT: .LBB3_27: // %else52
-; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60
; NONEON-NOSVE-NEXT: .LBB3_28: // %else54
-; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61
; NONEON-NOSVE-NEXT: .LBB3_29: // %else56
-; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62
; NONEON-NOSVE-NEXT: .LBB3_30: // %else58
-; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63
; NONEON-NOSVE-NEXT: .LBB3_31: // %else60
-; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_33
-; NONEON-NOSVE-NEXT: .LBB3_32: // %cond.store61
-; NONEON-NOSVE-NEXT: strb wzr, [x0, #31]
-; NONEON-NOSVE-NEXT: .LBB3_33: // %else62
-; NONEON-NOSVE-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT: add sp, sp, #80
+; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64
+; NONEON-NOSVE-NEXT: .LBB3_32: // %else62
; NONEON-NOSVE-NEXT: ret
-; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store
+; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.store
; NONEON-NOSVE-NEXT: strb wzr, [x0]
; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store1
+; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store1
; NONEON-NOSVE-NEXT: strb wzr, [x0, #1]
; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store3
+; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store3
; NONEON-NOSVE-NEXT: strb wzr, [x0, #2]
; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store5
+; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store5
; NONEON-NOSVE-NEXT: strb wzr, [x0, #3]
; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store7
+; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store7
; NONEON-NOSVE-NEXT: strb wzr, [x0, #4]
; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store9
+; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store9
; NONEON-NOSVE-NEXT: strb wzr, [x0, #5]
; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store11
+; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store11
; NONEON-NOSVE-NEXT: strb wzr, [x0, #6]
; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store13
+; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store13
; NONEON-NOSVE-NEXT: strb wzr, [x0, #7]
; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store15
+; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store15
; NONEON-NOSVE-NEXT: strb wzr, [x0, #8]
; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store17
+; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store17
; NONEON-NOSVE-NEXT: strb wzr, [x0, #9]
; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store19
+; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store19
; NONEON-NOSVE-NEXT: strb wzr, [x0, #10]
; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store21
+; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store21
; NONEON-NOSVE-NEXT: strb wzr, [x0, #11]
; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store23
+; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store23
; NONEON-NOSVE-NEXT: strb wzr, [x0, #12]
; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store25
+; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store25
; NONEON-NOSVE-NEXT: strb wzr, [x0, #13]
; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store27
+; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store27
; NONEON-NOSVE-NEXT: strb wzr, [x0, #14]
; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store29
+; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store29
; NONEON-NOSVE-NEXT: strb wzr, [x0, #15]
; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store31
+; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store31
; NONEON-NOSVE-NEXT: strb wzr, [x0, #16]
; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store33
+; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store33
; NONEON-NOSVE-NEXT: strb wzr, [x0, #17]
; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store35
+; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store35
; NONEON-NOSVE-NEXT: strb wzr, [x0, #18]
; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store37
+; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store37
; NONEON-NOSVE-NEXT: strb wzr, [x0, #19]
; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store39
+; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store39
; NONEON-NOSVE-NEXT: strb wzr, [x0, #20]
; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store41
+; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store41
; NONEON-NOSVE-NEXT: strb wzr, [x0, #21]
; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store43
+; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store43
; NONEON-NOSVE-NEXT: strb wzr, [x0, #22]
; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store45
+; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store45
; NONEON-NOSVE-NEXT: strb wzr, [x0, #23]
; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store47
+; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store47
; NONEON-NOSVE-NEXT: strb wzr, [x0, #24]
; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store49
+; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store49
; NONEON-NOSVE-NEXT: strb wzr, [x0, #25]
; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store51
+; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store51
; NONEON-NOSVE-NEXT: strb wzr, [x0, #26]
; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store53
+; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store53
; NONEON-NOSVE-NEXT: strb wzr, [x0, #27]
; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store55
+; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store55
; NONEON-NOSVE-NEXT: strb wzr, [x0, #28]
; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store57
+; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store57
; NONEON-NOSVE-NEXT: strb wzr, [x0, #29]
; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store59
+; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store59
; NONEON-NOSVE-NEXT: strb wzr, [x0, #30]
-; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT: b .LBB3_33
+; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store61
+; NONEON-NOSVE-NEXT: strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT: ret
call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
ret void
}
@@ -981,89 +820,37 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
;
; NONEON-NOSVE-LABEL: masked_store_v16f16:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT: str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w11, [sp]
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: str q0, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w12, w13
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w10, w14
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT: and w9, w9, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: and w12, w12, #0x10
+; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w13, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w12
+; NONEON-NOSVE-NEXT: and w12, w14, #0x40
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w10, w8
+; NONEON-NOSVE-NEXT: orr w10, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: and w9, w9, #0xffffff80
; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w8, w8, w10
; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17
; NONEON-NOSVE-NEXT: // %bb.1: // %else
; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18
@@ -1096,7 +883,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: .LBB7_15: // %else28
; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32
; NONEON-NOSVE-NEXT: .LBB7_16: // %else30
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.store
; NONEON-NOSVE-NEXT: fmov s0, wzr
@@ -1161,7 +948,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.store29
; NONEON-NOSVE-NEXT: fmov s0, wzr
; NONEON-NOSVE-NEXT: str h0, [x0, #30]
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
ret void
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
deleted file mode 100644
index 41868a8c790f1..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN: | llvm-objdump --mattr=-neon,+sme -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-bfcvt h5, s3
-// CHECK-INST: bfcvt h5, s3
-// CHECK-ENCODING: [0x65,0x40,0x63,0x1e]
-// CHECK-ERROR: instruction requires: bf16 neon or sme
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
deleted file mode 100644
index 138a1fe0bb8e9..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
+++ /dev/null
@@ -1,132 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-// Scalar FP instructions
-
-fmulx s0, s1, s2
-// CHECK-INST: fmulx s0, s1, s2
-// CHECK-ENCODING: [0x20,0xdc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-fmulx d0, d1, d2
-// CHECK-INST: fmulx d0, d1, d2
-// CHECK-ENCODING: [0x20,0xdc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps s0, s1, s2
-// CHECK-INST: frecps s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps d0, d1, d2
-// CHECK-INST: frecps d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts s0, s1, s2
-// CHECK-INST: frsqrts s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0xa2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts d0, d1, d2
-// CHECK-INST: frsqrts d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0xe2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe s0, s1
-// CHECK-INST: frecpe s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe d0, d1
-// CHECK-INST: frecpe d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx s0, s1
-// CHECK-INST: frecpx s0, s1
-// CHECK-ENCODING: [0x20,0xf8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx d0, d1
-// CHECK-INST: frecpx d0, d1
-// CHECK-ENCODING: [0x20,0xf8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte s0, s1
-// CHECK-INST: frsqrte s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte d0, d1
-// CHECK-INST: frsqrte d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-// Vector to GPR integer move instructions
-
-smov w0, v0.b[0]
-// CHECK-INST: smov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.b[0]
-// CHECK-INST: smov x0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov w0, v0.h[0]
-// CHECK-INST: smov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.h[0]
-// CHECK-INST: smov x0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.s[0]
-// CHECK-INST: smov x0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x2c,0x04,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.b[0]
-// CHECK-INST: umov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x3c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.h[0]
-// CHECK-INST: umov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x3c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-// Aliases
-
-mov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-mov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon
More information about the llvm-commits
mailing list