[llvm] [AArch64] Let patterns for NEON instructions check runtime mode. (PR #95560)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 14 09:09:11 PDT 2024
https://github.com/sdesmalen-arm created https://github.com/llvm/llvm-project/pull/95560
I've had to change:
HasNEON -> IsNEONAvailable (for instructions that are only available in non-streaming mode)
HasNEONorSME -> HasNEON (for instructions that are available in either mode)
In contrast, the HasSVE predicate (and related predicates) don't need a similar change, because all patterns predicated on HasSVE use scalable vector types, and in AArch64ISelLowering we've already made sure that none of those types and operations are legal if the function is not in the right mode (i.e. if it only has +sme, without +sve).
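For context, the difference between the two predicates comes down to the subtarget query they expand to. Below is a minimal C++ sketch of that distinction; the member names mirror AArch64Subtarget, but the exact conditions (in particular the +sme-fa64 carve-out) are an assumption here, and the authoritative logic lives in AArch64Subtarget.h:

  // Minimal sketch, not the in-tree implementation: illustrates how
  // hasNEON() (feature-only) differs from isNeonAvailable() (feature plus
  // runtime mode). The sme-fa64 handling below is an assumption.
  struct SubtargetSketch {
    bool HasNEON = true;
    bool HasSMEFA64 = false;          // +sme-fa64: full NEON allowed while streaming
    bool Streaming = false;           // function is a streaming function
    bool StreamingCompatible = false; // function is streaming-compatible

    // Backs the HasNEON TableGen predicate: only checks the feature bit.
    bool hasNEON() const { return HasNEON; }

    // Backs the IsNEONAvailable TableGen predicate: additionally checks that
    // the runtime (streaming) mode permits the full NEON instruction set.
    bool isNeonAvailable() const {
      return HasNEON && (HasSMEFA64 || (!Streaming && !StreamingCompatible));
    }
  };

So a pattern guarded by IsNEONAvailable is rejected for streaming(-compatible) functions, while one guarded by HasNEON only requires the +neon feature.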
>From 3490381a6ca1fe95c411ace8df3af587efc5cb93 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 14 Jun 2024 16:14:23 +0100
Subject: [PATCH] [AArch64] Let patterns for NEON instructions check runtime
mode.
I've had to change:
HasNEON -> IsNEONAvailable (for instructions that are only available in
non-streaming mode)
HasNEONorSME -> HasNEON (for instructions that are available in either mode)
In contrast, the HasSVE predicate (and related predicates) don't need a
similar change, because all patterns predicated on HasSVE use scalable vector
types, and in AArch64ISelLowering we've already made sure that none of those
types and operations are legal if the function is not in the right mode (i.e.
if it only has +sme, without +sve).
---
.../Target/AArch64/AArch64ISelLowering.cpp | 3 +-
.../lib/Target/AArch64/AArch64InstrFormats.td | 114 ++--
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 98 ++-
...streaming-mode-fixed-length-masked-load.ll | 422 ++++---------
...treaming-mode-fixed-length-masked-store.ll | 561 ++++++------------
.../MC/AArch64/SME/streaming-mode-neon-bf16.s | 16 -
.../test/MC/AArch64/SME/streaming-mode-neon.s | 132 -----
7 files changed, 388 insertions(+), 958 deletions(-)
delete mode 100644 llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
delete mode 100644 llvm/test/MC/AArch64/SME/streaming-mode-neon.s
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac6f1e07c4184..7cbae90ef3ca4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22185,7 +22185,8 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
SmallVector<SDValue, 16> MaskConstants;
- if (VecVT == MVT::v16i8) {
+ if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
+ VecVT == MVT::v16i8) {
// v16i8 is a special case, as we have 16 entries but only 8 positional bits
// per entry. We split it into two halves, apply the mask, zip the halves to
// create 8x 16-bit values, and the perform the vector reduce.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1f437d0ed6f8d..d67aabeee0010 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5762,7 +5762,7 @@ multiclass FPMoveImmediate<string asm> {
// AdvSIMD
//----------------------------------------------------------------------------
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
//----------------------------------------------------------------------------
// AdvSIMD three register vector instructions
@@ -5966,14 +5966,14 @@ multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
asm, ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
asm, ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
@@ -5989,14 +5989,14 @@ let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
string asm,
SDPatternOperator OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
@@ -6011,7 +6011,7 @@ multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64,
asm, ".4h",
[(set (v4f16 V64:$dst),
@@ -6020,7 +6020,7 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
asm, ".8h",
[(set (v8f16 V128:$dst),
(OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$dst),
@@ -6480,14 +6480,14 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode,
int fpexceptions = 1> {
let mayRaiseFPException = fpexceptions, Uses = !if(fpexceptions,[FPCR],[]<Register>) in {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
asm, ".4h", ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
asm, ".8h", ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
@@ -6538,14 +6538,14 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
@@ -6560,14 +6560,14 @@ multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
asm, ".4h", ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
asm, ".8h", ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
@@ -6720,14 +6720,14 @@ multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
string asm, SDNode OpNode> {
let mayRaiseFPException = 1, Uses = [FPCR] in {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
asm, ".4h", "0.0",
v4i16, v4f16, OpNode>;
def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128,
asm, ".8h", "0.0",
v8i16, v8f16, OpNode>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64,
asm, ".2s", "0.0",
v2i32, v2f32, OpNode>;
@@ -6739,7 +6739,7 @@ multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
v2i64, v2f64, OpNode>;
}
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
(!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0",
@@ -6751,7 +6751,7 @@ multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : InstAlias<asm # ".4h\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # ".8h\t$Vd, $Vn, #0",
@@ -7394,7 +7394,7 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm> {
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag,
- Predicate pred = HasNEON> {
+ Predicate pred = IsNEONAvailable> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPCR] in {
let Predicates = [pred] in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
@@ -7419,10 +7419,10 @@ multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
}
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
@@ -7571,7 +7571,7 @@ multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
let mayRaiseFPException = 1, Uses = [FPCR] in {
def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">;
}
}
@@ -7580,7 +7580,7 @@ multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>;
}
@@ -7603,7 +7603,7 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
- Predicate pred = HasNEON> {
+ Predicate pred = IsNEONAvailable> {
let Predicates = [pred] in {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
@@ -7620,7 +7620,7 @@ multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
[(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
}
@@ -7698,7 +7698,7 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64,
asm, ".2h">;
}
@@ -7763,14 +7763,14 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
SDPatternOperator intOp> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
asm, ".4h",
[(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>;
def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
asm, ".8h",
[(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
asm, ".4s",
[(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
@@ -7880,7 +7880,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst,
multiclass SMov {
// SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorSME] in {
+ let Predicates = [HasNEON] in {
def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
@@ -7927,7 +7927,7 @@ multiclass SMov {
multiclass UMov {
// UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorSME] in {
+ let Predicates = [HasNEON] in {
def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
@@ -8816,7 +8816,7 @@ multiclass SIMDThreeSameVectorFP8DOT2Index<string asm> {
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let mayRaiseFPException = 1, Uses = [FPCR] in {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
V64, V64,
V128_lo, VectorIndexH,
@@ -8842,7 +8842,7 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
@@ -8880,7 +8880,7 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
let Inst{21} = 0;
}
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h",
@@ -8893,7 +8893,7 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
@@ -8920,7 +8920,7 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
}
} // mayRaiseFPException = 1, Uses = [FPCR]
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : Pat<(f16 (OpNode
(f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
(f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
@@ -8928,7 +8928,7 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)), V128:$Rm, VectorIndexH:$idx)>;
}
- let Predicates = [HasNEON] in {
+ let Predicates = [IsNEONAvailable] in {
def : Pat<(f32 (OpNode
(f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
(f32 (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx)))),
@@ -8944,7 +8944,7 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
}
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
// Patterns for f16: DUPLANE, DUP scalar and vector_extract.
def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
(AArch64duplane16 (v8f16 V128_lo:$Rm),
@@ -8970,7 +8970,7 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
(vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
(!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
V128_lo:$Rm, VectorIndexH:$idx)>;
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
@@ -9021,7 +9021,7 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h", []> {
@@ -9040,7 +9040,7 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
V128, VectorIndexS,
@@ -9068,7 +9068,7 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
let Inst{21} = 0;
}
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
@@ -9077,7 +9077,7 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
@@ -9673,12 +9673,12 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR16, vecshiftR16, asm, []> {
let Inst{19-16} = imm{3-0};
}
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
@@ -9860,7 +9860,7 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
@@ -9876,7 +9876,7 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
bits<4> imm;
let Inst{19-16} = imm;
}
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
@@ -9904,7 +9904,7 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
- let Predicates = [HasNEON, HasFullFP16] in {
+ let Predicates = [IsNEONAvailable, HasFullFP16] in {
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
@@ -9920,7 +9920,7 @@ multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,
bits<4> imm;
let Inst{19-16} = imm;
}
- } // Predicates = [HasNEON, HasFullFP16]
+ } // Predicates = [IsNEONAvailable, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
@@ -11191,13 +11191,13 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
}
-} // end of 'let Predicates = [HasNEON]'
+} // end of 'let Predicates = [IsNEONAvailable]'
//----------------------------------------------------------------------------
// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
//----------------------------------------------------------------------------
-let Predicates = [HasNEON, HasRDM] in {
+let Predicates = [IsNEONAvailable, HasRDM] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
@@ -11351,7 +11351,7 @@ class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode,
//8.3 CompNum - Floating-point complex number support
multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
string asm, SDPatternOperator OpNode>{
- let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+ let Predicates = [HasComplxNum, IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype,
asm, ".4h",
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
@@ -11367,7 +11367,7 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
(i32 rottype:$rot)))]>;
}
- let Predicates = [HasComplxNum, HasNEON] in {
+ let Predicates = [HasComplxNum, IsNEONAvailable] in {
def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype,
asm, ".2s",
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
@@ -11423,7 +11423,7 @@ class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size,
multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
Operand rottype, string asm,
SDPatternOperator OpNode> {
- let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+ let Predicates = [HasComplxNum, IsNEONAvailable, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64,
rottype, asm, ".4h",
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
@@ -11439,7 +11439,7 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
(i32 rottype:$rot)))]>;
}
- let Predicates = [HasComplxNum, HasNEON] in {
+ let Predicates = [HasComplxNum, IsNEONAvailable] in {
def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64,
rottype, asm, ".2s",
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
@@ -11505,7 +11505,7 @@ class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size,
// classes.
multiclass SIMDIndexedTiedComplexHSD<bit opc1, bit opc2, Operand rottype,
string asm> {
- let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+ let Predicates = [HasComplxNum, IsNEONAvailable, HasFullFP16] in {
def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64,
V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h",
".4h", ".h", []> {
@@ -11521,9 +11521,9 @@ multiclass SIMDIndexedTiedComplexHSD<bit opc1, bit opc2, Operand rottype,
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
- } // Predicates = HasComplxNum, HasNEON, HasFullFP16]
+ } // Predicates = HasComplxNum, IsNEONAvailable, HasFullFP16]
- let Predicates = [HasComplxNum, HasNEON] in {
+ let Predicates = [HasComplxNum, IsNEONAvailable] in {
def v4f32_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b10, opc1, opc2,
V128, V128, V128, VectorIndexD, rottype, asm, ".4s",
".4s", ".4s", ".s", []> {
@@ -11531,7 +11531,7 @@ multiclass SIMDIndexedTiedComplexHSD<bit opc1, bit opc2, Operand rottype,
let Inst{11} = idx{0};
let Inst{21} = 0;
}
- } // Predicates = [HasComplxNum, HasNEON]
+ } // Predicates = [HasComplxNum, IsNEONAvailable]
}
//----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index dd54520c8ddad..39439eb039af4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -108,6 +108,10 @@ def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPC_IMMO()">,
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
+ AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
+// Not all NEON instructions are available in Streaming SVE execution mode.
+// In contrast to HasNEON, IsNEONAvailable also checks the runtime mode.
+def IsNEONAvailable : Predicate<"Subtarget->isNeonAvailable()">,
AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;
def HasSM4 : Predicate<"Subtarget->hasSM4()">,
AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">;
@@ -234,12 +238,6 @@ def HasSMEF16F16orSMEF8F16
AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
"sme-f16f16 or sme-f8f16">;
-// A subset of NEON instructions are legal in Streaming SVE execution mode,
-// they should be enabled if either has been specified.
-def HasNEONorSME
- : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">,
- AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME),
- "neon or sme">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -1294,7 +1292,7 @@ defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", AArch64udot>;
}
// ARMv8.6-A BFloat
-let Predicates = [HasNEON, HasBF16] in {
+let Predicates = [IsNEONAvailable, HasBF16] in {
defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">;
defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">;
def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;
@@ -1326,7 +1324,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
VectorIndexS:$idx)>;
}
-let Predicates = [HasNEONorSME, HasBF16] in {
+let Predicates = [HasNEON, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
// Round FP32 to BF16.
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
@@ -1367,7 +1365,7 @@ defm SUDOTlane : SIMDSUDOTIndex;
}
// ARMv8.2-A FP16 Fused Multiply-Add Long
-let Predicates = [HasNEON, HasFP16FML] in {
+let Predicates = [IsNEONAvailable, HasFP16FML] in {
defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
defm FMLSL : SIMDThreeSameVectorFML<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>;
defm FMLAL2 : SIMDThreeSameVectorFML<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>;
@@ -1503,7 +1501,7 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
"fcadd", null_frag>;
defm FCMLA : SIMDIndexedTiedComplexHSD<0, 1, complexrotateop, "fcmla">;
-let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+let Predicates = [HasComplxNum, IsNEONAvailable, HasFullFP16] in {
def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
(FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 0))>;
def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot270 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
@@ -1514,7 +1512,7 @@ let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
(FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>;
}
-let Predicates = [HasComplxNum, HasNEON] in {
+let Predicates = [HasComplxNum, IsNEONAvailable] in {
def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
(FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>;
def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot270 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
@@ -1550,7 +1548,7 @@ multiclass FCMLA_LANE_PATS<ValueType ty, DAGOperand Reg, dag RHSDup> {
}
-let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+let Predicates = [HasComplxNum, IsNEONAvailable, HasFullFP16] in {
defm : FCMLA_PATS<v4f16, V64>;
defm : FCMLA_PATS<v8f16, V128>;
@@ -1559,7 +1557,7 @@ let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
defm : FCMLA_LANE_PATS<v8f16, V128,
(v8f16 (bitconvert (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))>;
}
-let Predicates = [HasComplxNum, HasNEON] in {
+let Predicates = [HasComplxNum, IsNEONAvailable] in {
defm : FCMLA_PATS<v2f32, V64>;
defm : FCMLA_PATS<v4f32, V128>;
defm : FCMLA_PATS<v2f64, V128>;
@@ -4858,7 +4856,7 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
// Here we handle first -(a + b*c) for FNMADD:
-let Predicates = [HasNEON, HasFullFP16] in
+let Predicates = [IsNEONAvailable, HasFullFP16] in
def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)),
(FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
@@ -4870,7 +4868,7 @@ def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
// Now it's time for "(-a) + (-b)*c"
-let Predicates = [HasNEON, HasFullFP16] in
+let Predicates = [IsNEONAvailable, HasFullFP16] in
def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))),
(FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
@@ -5127,7 +5125,7 @@ defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
// Aliases for MVN -> NOT.
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}",
(NOTv8i8 V64:$Vd, V64:$Vn)>;
def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
@@ -5283,11 +5281,11 @@ foreach VT = [ v8i8, v16i8, v4i16, v8i16, v2i32, v4i32, v2i64 ] in {
def : Pat<(vnot (AArch64cmeqz VT:$Rn)), (!cast<Instruction>("CMTST"#VT) VT:$Rn, VT:$Rn)>;
}
defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
foreach VT = [ v2f32, v4f32, v2f64 ] in
def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
}
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
foreach VT = [ v4f16, v8f16 ] in
def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
}
@@ -5499,7 +5497,7 @@ defm : SelectSetCCZeroLHS<setge, "CMLE">;
defm : SelectSetCCZeroLHS<setlt, "CMGT">;
defm : SelectSetCCZeroLHS<setle, "CMGE">;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
(ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
@@ -5606,7 +5604,7 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
"|cmlt.2d\t$dst, $src1, $src2}",
(CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
"|fcmle.4h\t$dst, $src1, $src2}",
(FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -5624,7 +5622,7 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmle.2d\t$dst, $src1, $src2}",
(FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
"|fcmlt.4h\t$dst, $src1, $src2}",
(FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -5642,7 +5640,7 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmlt.2d\t$dst, $src1, $src2}",
(FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
"|facle.4h\t$dst, $src1, $src2}",
(FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -5660,7 +5658,7 @@ def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
"|facle.2d\t$dst, $src1, $src2}",
(FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
"|faclt.4h\t$dst, $src1, $src2}",
(FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -5693,10 +5691,10 @@ defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FABD64 FPR64:$Rn, FPR64:$Rm)>;
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 f16:$Rn, f16:$Rm)>;
}
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>;
def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>;
}
@@ -5707,9 +5705,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>;
-defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>;
-defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEON>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEON>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEON>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -5738,9 +5736,9 @@ let Predicates = [HasRDM] in {
defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
int_aarch64_neon_fmulx,
- [HasNEONorSME]>;
+ [HasNEON]>;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : InstAlias<"cmls $dst, $src1, $src2",
(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"cmle $dst, $src1, $src2",
@@ -5812,9 +5810,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorSME>;
-defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorSME>;
-defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorSME>;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEON>;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEON>;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEON>;
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5833,7 +5831,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
(CMLTv1i64rz V64:$Rn)>;
// Round FP64 to BF16.
-let Predicates = [HasNEONorSME, HasBF16] in
+let Predicates = [HasNEON, HasBF16] in
def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
(BFCVT (FCVTXNv1i64 $Rn))>;
@@ -6036,7 +6034,7 @@ def : Pat <(f64 (uint_to_fp (i32
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
-} // let Predicates = [HasNEON]
+} // let Predicates = [IsNEONAvailable]
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
@@ -7239,7 +7237,7 @@ defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
// AdvSIMD ORR
defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
@@ -7271,14 +7269,14 @@ def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8,
def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
"fmov", ".4s",
[(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8,
"fmov", ".4h",
[(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
"fmov", ".8h",
[(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
-} // Predicates = [HasNEON, HasFullFP16]
+} // Predicates = [IsNEONAvailable, HasFullFP16]
// AdvSIMD MOVI
@@ -7335,7 +7333,7 @@ def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
// Using the MOVI to materialize fp constants.
def : Pat<(f32 fpimm32SIMDModImmType4:$in),
(EXTRACT_SUBREG (MOVIv2i32 (fpimm32SIMDModImmType4XForm f32:$in),
@@ -7343,7 +7341,7 @@ let Predicates = [HasNEON] in {
ssub)>;
}
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -7389,7 +7387,7 @@ def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -7716,7 +7714,7 @@ defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", AArch64rshrn>;
defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : Pat<(v2f32 (sint_to_fp (v2i32 (AArch64vashr_exact v2i32:$Vn, i32:$shift)))),
(SCVTFv2i32_shift $Vn, vecshiftR32:$shift)>;
@@ -7727,7 +7725,7 @@ def : Pat<(v2f64 (sint_to_fp (v2i64 (AArch64vashr_exact v2i64:$Vn, i32:$shift)))
(SCVTFv2i64_shift $Vn, vecshiftR64:$shift)>;
}
-let Predicates = [HasNEON, HasFullFP16] in {
+let Predicates = [IsNEONAvailable, HasFullFP16] in {
def : Pat<(v4f16 (sint_to_fp (v4i16 (AArch64vashr_exact v4i16:$Vn, i32:$shift)))),
(SCVTFv4i16_shift $Vn, vecshiftR16:$shift)>;
@@ -7912,7 +7910,7 @@ def : Pat<(v2i64 (zext (v2i32 (extract_high_v4i32 (v4i32 V128:$Rn)) ))),
def : Pat<(v2i64 (sext (v2i32 (extract_high_v4i32 (v4i32 V128:$Rn)) ))),
(SSHLLv4i32_shift V128:$Rn, (i32 0))>;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
// Vector shift sxtl aliases
def : InstAlias<"sxtl.8h $dst, $src1",
(SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
@@ -7994,7 +7992,7 @@ def neg_f16 :
def : Pat<(f16 (fneg (f16 FPR16:$Rn))), (f16 (neg_f16 (f16 FPR16:$Rn)))>;
def : Pat<(bf16 (fneg (bf16 FPR16:$Rn))), (bf16 (neg_f16 (bf16 FPR16:$Rn)))>;
-let Predicates = [HasNEON] in {
+let Predicates = [IsNEONAvailable] in {
def : Pat<(v4f16 (fabs (v4f16 V64:$Rn))), (v4f16 (BICv4i16 (v4f16 V64:$Rn), (i32 128), (i32 8)))>;
def : Pat<(v4bf16 (fabs (v4bf16 V64:$Rn))), (v4bf16 (BICv4i16 (v4bf16 V64:$Rn), (i32 128), (i32 8)))>;
def : Pat<(v8f16 (fabs (v8f16 V128:$Rn))), (v8f16 (BICv8i16 (v8f16 V128:$Rn), (i32 128), (i32 8)))>;
@@ -8033,7 +8031,7 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
dsub)),
0),
ssub)))>,
- Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, IsNEONAvailable]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -8054,7 +8052,7 @@ class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
hsub),
0),
ssub)))>,
- Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, IsNEONAvailable]>;
def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
@@ -8088,7 +8086,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
dsub)),
0),
dsub)))>,
- Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, IsNEONAvailable]>;
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
@@ -8108,7 +8106,7 @@ class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
ssub),
0),
dsub)))>,
- Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, IsNEONAvailable]>;
def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
(LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
@@ -9749,7 +9747,7 @@ let Predicates = [HasRCPC3] in {
def LDAPRXpost: BaseLRCPC3IntegerLoadStore<0b11, 0b11, (outs GPR64sp:$wback, GPR64:$Rt), (ins GPR64sp:$Rn), "ldapr", "\t$Rt, [$Rn], #8", "$Rn = $wback">;
}
-let Predicates = [HasRCPC3, HasNEON] in {
+let Predicates = [HasRCPC3, IsNEONAvailable] in {
// size opc regtype
defm STLURb: LRCPC3NEONLoadStoreUnscaledOffset<0b00, 0b00, FPR8 , (outs), (ins FPR8 :$Rt, GPR64sp:$Rn, simm9:$simm), "stlur">;
defm STLURh: LRCPC3NEONLoadStoreUnscaledOffset<0b01, 0b00, FPR16 , (outs), (ins FPR16 :$Rt, GPR64sp:$Rn, simm9:$simm), "stlur">;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index be335c697707d..029f5d0b8a12a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -315,92 +315,40 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-LABEL: masked_load_v16i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: sub sp, sp, #1024
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT: sub sp, sp, #992
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1008
; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
; NONEON-NOSVE-NEXT: str q0, [sp, #976]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #995]
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #979]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #977]
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #980]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #981]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #982]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w11, [sp, #976]
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1012]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1014]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1016]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1018]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1020]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w12, w13
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w10, w14
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1022]
+; NONEON-NOSVE-NEXT: and w9, w9, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #983]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: and w12, w12, #0x10
+; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w13, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w12
+; NONEON-NOSVE-NEXT: and w12, w14, #0x40
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w10, w8
+; NONEON-NOSVE-NEXT: orr w10, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: and w9, w9, #0xffffff80
; NONEON-NOSVE-NEXT: add w8, w8, w9
; NONEON-NOSVE-NEXT: add x9, sp, #720
-; NONEON-NOSVE-NEXT: add w8, w8, w10
; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB2_2
; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load
; NONEON-NOSVE-NEXT: ldrb w10, [x0]
@@ -481,7 +429,7 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: strb w8, [sp, #47]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: .LBB2_19: // %else44
-; NONEON-NOSVE-NEXT: add sp, sp, #1024
+; NONEON-NOSVE-NEXT: add sp, sp, #992
; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load4
@@ -806,166 +754,62 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
; NONEON-NOSVE-LABEL: masked_load_v32i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: sub sp, sp, #2064
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 2080
+; NONEON-NOSVE-NEXT: sub sp, sp, #2000
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 2016
; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #2216]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2152]
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2272]
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #2176]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #2160]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2024]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2264]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2016]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2256]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2031]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2248]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2030]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x20
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2240]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2029]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2232]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2028]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x8
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2224]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2027]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2208]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2026]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2200]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2025]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2192]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2023]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #2184]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #2022]
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #2096]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #2104]
+; NONEON-NOSVE-NEXT: sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT: ldr w10, [sp, #2112]
+; NONEON-NOSVE-NEXT: ldr w11, [sp, #2088]
+; NONEON-NOSVE-NEXT: ldr w12, [sp, #2120]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2168]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x20
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #2021]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2020]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x8
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2019]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2088]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2018]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2136]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2017]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2144]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2008]
-; NONEON-NOSVE-NEXT: and w8, w1, #0x1
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #2104]
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2000]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #2080]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #2016]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2128]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2015]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #2120]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2014]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #2112]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x20
+; NONEON-NOSVE-NEXT: ldr w13, [sp, #2136]
; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #2013]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #2096]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x10
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #2012]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2011]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x4
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2010]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2009]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x80
-; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2007]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2006]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x20
-; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2005]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2004]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x8
+; NONEON-NOSVE-NEXT: ldr w14, [sp, #2144]
+; NONEON-NOSVE-NEXT: ldr w16, [sp, #2016]
+; NONEON-NOSVE-NEXT: and w8, w8, #0x2
+; NONEON-NOSVE-NEXT: and w9, w9, #0x4
+; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT: ldr w12, [sp, #2128]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w11, #0x10
+; NONEON-NOSVE-NEXT: sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: and w9, w11, #0x20
+; NONEON-NOSVE-NEXT: sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT: and w10, w10, #0x40
+; NONEON-NOSVE-NEXT: sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w12, #0x8
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2003]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2002]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2001]
-; NONEON-NOSVE-NEXT: str q0, [sp, #2048]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #2000]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2050]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2048]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2052]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2054]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2056]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2058]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2060]
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w12, w13
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: add w9, w9, w10
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: str q0, [sp, #2032]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2034]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2032]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2036]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #2038]
-; NONEON-NOSVE-NEXT: ldrh w15, [sp, #2040]
-; NONEON-NOSVE-NEXT: ldrh w16, [sp, #2042]
-; NONEON-NOSVE-NEXT: add w10, w12, w11
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2044]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2062]
-; NONEON-NOSVE-NEXT: add w13, w13, w14
-; NONEON-NOSVE-NEXT: add w14, w15, w16
-; NONEON-NOSVE-NEXT: add w10, w10, w13
-; NONEON-NOSVE-NEXT: add w11, w14, w11
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2046]
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w8, w12
-; NONEON-NOSVE-NEXT: add w8, w9, w13
+; NONEON-NOSVE-NEXT: and w11, w11, #0x4
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: and w12, w13, #0x10
+; NONEON-NOSVE-NEXT: sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w13, #0x20
+; NONEON-NOSVE-NEXT: and w13, w15, #0x40
+; NONEON-NOSVE-NEXT: sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w10, w11
+; NONEON-NOSVE-NEXT: orr w10, w12, w13
+; NONEON-NOSVE-NEXT: and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT: add w11, w8, w11
+; NONEON-NOSVE-NEXT: add w8, w9, w10
; NONEON-NOSVE-NEXT: adrp x9, .LCPI3_0
-; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT: bfi w8, w11, #16, #16
; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI3_0]
; NONEON-NOSVE-NEXT: add x9, sp, #1744
; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB3_2
@@ -1082,7 +926,7 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
; NONEON-NOSVE-NEXT: strb w8, [sp, #47]
; NONEON-NOSVE-NEXT: ldr q1, [sp, #32]
; NONEON-NOSVE-NEXT: .LBB3_35: // %else92
-; NONEON-NOSVE-NEXT: add sp, sp, #2064
+; NONEON-NOSVE-NEXT: add sp, sp, #2000
; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load4
@@ -1995,94 +1839,42 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-LABEL: masked_load_v16f16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: sub sp, sp, #1024
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT: sub sp, sp, #992
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1008
; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
; NONEON-NOSVE-NEXT: str q0, [sp, #976]
; NONEON-NOSVE-NEXT: adrp x9, .LCPI7_0
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #995]
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #979]
+; NONEON-NOSVE-NEXT: ldrb w11, [sp, #977]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #980]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #981]
+; NONEON-NOSVE-NEXT: ldrb w15, [sp, #982]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #976]
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT: and w10, w10, #0x8
+; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #983]
+; NONEON-NOSVE-NEXT: and w11, w11, #0x2
+; NONEON-NOSVE-NEXT: and w13, w13, #0x10
+; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w14, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w13
+; NONEON-NOSVE-NEXT: and w13, w15, #0x40
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w11, w8
+; NONEON-NOSVE-NEXT: orr w11, w12, w13
; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT: orr w8, w8, w11
+; NONEON-NOSVE-NEXT: and w10, w10, #0xffffff80
; NONEON-NOSVE-NEXT: add x9, sp, #720
-; NONEON-NOSVE-NEXT: str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1008]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1012]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1014]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1016]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1018]
-; NONEON-NOSVE-NEXT: ldrh w15, [sp, #1020]
-; NONEON-NOSVE-NEXT: add w8, w10, w8
-; NONEON-NOSVE-NEXT: add w10, w11, w12
-; NONEON-NOSVE-NEXT: add w11, w13, w14
-; NONEON-NOSVE-NEXT: add w8, w8, w10
-; NONEON-NOSVE-NEXT: add w10, w11, w15
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1022]
; NONEON-NOSVE-NEXT: add w8, w8, w10
-; NONEON-NOSVE-NEXT: add w8, w8, w11
; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB7_2
; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load
; NONEON-NOSVE-NEXT: fmov s0, wzr
@@ -2157,7 +1949,7 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: str h1, [sp, #46]
; NONEON-NOSVE-NEXT: ldr q1, [sp, #32]
; NONEON-NOSVE-NEXT: .LBB7_19: // %else44
-; NONEON-NOSVE-NEXT: add sp, sp, #1024
+; NONEON-NOSVE-NEXT: add sp, sp, #992
; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load4
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index a79ce9db9abfd..5eba12a048cf9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -172,89 +172,37 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
;
; NONEON-NOSVE-LABEL: masked_store_v16i8:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT: str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w11, [sp]
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: str q0, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w12, w13
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w10, w14
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT: and w9, w9, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: and w12, w12, #0x10
+; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w13, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w12
+; NONEON-NOSVE-NEXT: and w12, w14, #0x40
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w10, w8
+; NONEON-NOSVE-NEXT: orr w10, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: and w9, w9, #0xffffff80
; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w8, w8, w10
; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17
; NONEON-NOSVE-NEXT: // %bb.1: // %else
; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18
@@ -287,7 +235,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: .LBB2_15: // %else28
; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32
; NONEON-NOSVE-NEXT: .LBB2_16: // %else30
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.store
; NONEON-NOSVE-NEXT: strb wzr, [x0]
@@ -336,7 +284,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16
; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.store29
; NONEON-NOSVE-NEXT: strb wzr, [x0, #15]
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
ret void
@@ -421,328 +369,219 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
;
; NONEON-NOSVE-LABEL: masked_store_v32i8:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: sub sp, sp, #80
-; NONEON-NOSVE-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT: .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #216]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #152]
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #272]
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #176]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #160]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #264]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #256]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #248]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x20
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #240]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #232]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x8
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #224]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #208]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #200]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #192]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #22]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #168]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x20
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #21]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x8
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #19]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #80]
; NONEON-NOSVE-NEXT: ldr w9, [sp, #88]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #136]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #144]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #8]
-; NONEON-NOSVE-NEXT: and w8, w1, #0x1
-; NONEON-NOSVE-NEXT: ldr w11, [sp, #104]
-; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp]
-; NONEON-NOSVE-NEXT: ldr w12, [sp, #80]
-; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x80
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #128]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #15]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT: ldr w10, [sp, #120]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #14]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT: and w9, w9, #0x20
-; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT: strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT: sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT: ldr w10, [sp, #96]
+; NONEON-NOSVE-NEXT: ldr w12, [sp, #104]
+; NONEON-NOSVE-NEXT: ldr w11, [sp, #72]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #96]
-; NONEON-NOSVE-NEXT: and w10, w10, #0x10
-; NONEON-NOSVE-NEXT: zip1 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT: strb w10, [sp, #12]
-; NONEON-NOSVE-NEXT: strb w8, [sp, #11]
-; NONEON-NOSVE-NEXT: and w8, w11, #0x4
-; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #10]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x2
-; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #9]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x80
-; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #7]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x40
-; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #6]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x20
-; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #5]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x10
-; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #4]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x8
+; NONEON-NOSVE-NEXT: ldr w13, [sp, #120]
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: ldr w14, [sp, #128]
+; NONEON-NOSVE-NEXT: and w8, w8, #0x2
+; NONEON-NOSVE-NEXT: and w9, w9, #0x4
+; NONEON-NOSVE-NEXT: ldr w16, [sp]
+; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT: ldr w11, [sp, #112]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x8
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w12, #0x10
+; NONEON-NOSVE-NEXT: sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT: and w12, w12, #0x8
+; NONEON-NOSVE-NEXT: and w9, w11, #0x20
+; NONEON-NOSVE-NEXT: and w10, w10, #0x40
+; NONEON-NOSVE-NEXT: sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #3]
-; NONEON-NOSVE-NEXT: and w8, w9, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #2]
-; NONEON-NOSVE-NEXT: and w8, w10, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #1]
-; NONEON-NOSVE-NEXT: ldr q0, [sp]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #52]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #54]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #56]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #58]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #38]
-; NONEON-NOSVE-NEXT: ldrh w15, [sp, #40]
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #60]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #34]
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w12, w13
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #36]
-; NONEON-NOSVE-NEXT: ldrh w16, [sp, #42]
-; NONEON-NOSVE-NEXT: add w9, w9, w10
-; NONEON-NOSVE-NEXT: add w10, w12, w11
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #44]
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w12, w13, w14
-; NONEON-NOSVE-NEXT: add w14, w15, w16
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #62]
-; NONEON-NOSVE-NEXT: add w10, w10, w12
-; NONEON-NOSVE-NEXT: add w11, w14, w11
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #46]
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w8, w13
-; NONEON-NOSVE-NEXT: add w8, w9, w12
-; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16
-; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_34
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w11, #0x4
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: and w12, w13, #0x10
+; NONEON-NOSVE-NEXT: sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT: bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT: orr w11, w11, w12
+; NONEON-NOSVE-NEXT: and w12, w13, #0x20
+; NONEON-NOSVE-NEXT: and w13, w15, #0x40
+; NONEON-NOSVE-NEXT: sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT: orr w9, w10, w11
+; NONEON-NOSVE-NEXT: orr w10, w12, w13
+; NONEON-NOSVE-NEXT: and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT: orr w9, w9, w10
+; NONEON-NOSVE-NEXT: and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT: add w11, w8, w11
+; NONEON-NOSVE-NEXT: add w8, w9, w10
+; NONEON-NOSVE-NEXT: bfi w8, w11, #16, #16
+; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33
; NONEON-NOSVE-NEXT: // %bb.1: // %else
-; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_35
+; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34
; NONEON-NOSVE-NEXT: .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35
; NONEON-NOSVE-NEXT: .LBB3_3: // %else4
-; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36
; NONEON-NOSVE-NEXT: .LBB3_4: // %else6
-; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37
; NONEON-NOSVE-NEXT: .LBB3_5: // %else8
-; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38
; NONEON-NOSVE-NEXT: .LBB3_6: // %else10
-; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39
; NONEON-NOSVE-NEXT: .LBB3_7: // %else12
-; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40
; NONEON-NOSVE-NEXT: .LBB3_8: // %else14
-; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41
; NONEON-NOSVE-NEXT: .LBB3_9: // %else16
-; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42
; NONEON-NOSVE-NEXT: .LBB3_10: // %else18
-; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43
; NONEON-NOSVE-NEXT: .LBB3_11: // %else20
-; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44
; NONEON-NOSVE-NEXT: .LBB3_12: // %else22
-; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45
; NONEON-NOSVE-NEXT: .LBB3_13: // %else24
-; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46
; NONEON-NOSVE-NEXT: .LBB3_14: // %else26
-; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47
; NONEON-NOSVE-NEXT: .LBB3_15: // %else28
-; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48
; NONEON-NOSVE-NEXT: .LBB3_16: // %else30
-; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49
; NONEON-NOSVE-NEXT: .LBB3_17: // %else32
-; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50
; NONEON-NOSVE-NEXT: .LBB3_18: // %else34
-; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51
; NONEON-NOSVE-NEXT: .LBB3_19: // %else36
-; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52
; NONEON-NOSVE-NEXT: .LBB3_20: // %else38
-; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53
; NONEON-NOSVE-NEXT: .LBB3_21: // %else40
-; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54
; NONEON-NOSVE-NEXT: .LBB3_22: // %else42
-; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55
; NONEON-NOSVE-NEXT: .LBB3_23: // %else44
-; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56
; NONEON-NOSVE-NEXT: .LBB3_24: // %else46
-; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57
; NONEON-NOSVE-NEXT: .LBB3_25: // %else48
-; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58
; NONEON-NOSVE-NEXT: .LBB3_26: // %else50
-; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59
; NONEON-NOSVE-NEXT: .LBB3_27: // %else52
-; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60
; NONEON-NOSVE-NEXT: .LBB3_28: // %else54
-; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61
; NONEON-NOSVE-NEXT: .LBB3_29: // %else56
-; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62
; NONEON-NOSVE-NEXT: .LBB3_30: // %else58
-; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63
; NONEON-NOSVE-NEXT: .LBB3_31: // %else60
-; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_33
-; NONEON-NOSVE-NEXT: .LBB3_32: // %cond.store61
-; NONEON-NOSVE-NEXT: strb wzr, [x0, #31]
-; NONEON-NOSVE-NEXT: .LBB3_33: // %else62
-; NONEON-NOSVE-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT: add sp, sp, #80
+; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64
+; NONEON-NOSVE-NEXT: .LBB3_32: // %else62
; NONEON-NOSVE-NEXT: ret
-; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store
+; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.store
; NONEON-NOSVE-NEXT: strb wzr, [x0]
; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store1
+; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store1
; NONEON-NOSVE-NEXT: strb wzr, [x0, #1]
; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store3
+; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store3
; NONEON-NOSVE-NEXT: strb wzr, [x0, #2]
; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store5
+; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store5
; NONEON-NOSVE-NEXT: strb wzr, [x0, #3]
; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store7
+; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store7
; NONEON-NOSVE-NEXT: strb wzr, [x0, #4]
; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store9
+; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store9
; NONEON-NOSVE-NEXT: strb wzr, [x0, #5]
; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store11
+; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store11
; NONEON-NOSVE-NEXT: strb wzr, [x0, #6]
; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store13
+; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store13
; NONEON-NOSVE-NEXT: strb wzr, [x0, #7]
; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store15
+; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store15
; NONEON-NOSVE-NEXT: strb wzr, [x0, #8]
; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store17
+; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store17
; NONEON-NOSVE-NEXT: strb wzr, [x0, #9]
; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store19
+; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store19
; NONEON-NOSVE-NEXT: strb wzr, [x0, #10]
; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store21
+; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store21
; NONEON-NOSVE-NEXT: strb wzr, [x0, #11]
; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store23
+; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store23
; NONEON-NOSVE-NEXT: strb wzr, [x0, #12]
; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store25
+; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store25
; NONEON-NOSVE-NEXT: strb wzr, [x0, #13]
; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store27
+; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store27
; NONEON-NOSVE-NEXT: strb wzr, [x0, #14]
; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store29
+; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store29
; NONEON-NOSVE-NEXT: strb wzr, [x0, #15]
; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store31
+; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store31
; NONEON-NOSVE-NEXT: strb wzr, [x0, #16]
; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store33
+; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store33
; NONEON-NOSVE-NEXT: strb wzr, [x0, #17]
; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store35
+; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store35
; NONEON-NOSVE-NEXT: strb wzr, [x0, #18]
; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store37
+; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store37
; NONEON-NOSVE-NEXT: strb wzr, [x0, #19]
; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store39
+; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store39
; NONEON-NOSVE-NEXT: strb wzr, [x0, #20]
; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store41
+; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store41
; NONEON-NOSVE-NEXT: strb wzr, [x0, #21]
; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store43
+; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store43
; NONEON-NOSVE-NEXT: strb wzr, [x0, #22]
; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store45
+; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store45
; NONEON-NOSVE-NEXT: strb wzr, [x0, #23]
; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store47
+; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store47
; NONEON-NOSVE-NEXT: strb wzr, [x0, #24]
; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store49
+; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store49
; NONEON-NOSVE-NEXT: strb wzr, [x0, #25]
; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store51
+; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store51
; NONEON-NOSVE-NEXT: strb wzr, [x0, #26]
; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store53
+; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store53
; NONEON-NOSVE-NEXT: strb wzr, [x0, #27]
; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store55
+; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store55
; NONEON-NOSVE-NEXT: strb wzr, [x0, #28]
; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store57
+; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store57
; NONEON-NOSVE-NEXT: strb wzr, [x0, #29]
; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store59
+; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store59
; NONEON-NOSVE-NEXT: strb wzr, [x0, #30]
-; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT: b .LBB3_33
+; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store61
+; NONEON-NOSVE-NEXT: strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT: ret
call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
ret void
}
@@ -982,89 +821,37 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
;
; NONEON-NOSVE-LABEL: masked_store_v16f16:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp]
-; NONEON-NOSVE-NEXT: and w8, w8, #0x1
-; NONEON-NOSVE-NEXT: strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x80
-; NONEON-NOSVE-NEXT: strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x40
-; NONEON-NOSVE-NEXT: strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x20
-; NONEON-NOSVE-NEXT: strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x10
-; NONEON-NOSVE-NEXT: strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x8
-; NONEON-NOSVE-NEXT: strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT: str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6]
; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT: ldrb w11, [sp]
+; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1
; NONEON-NOSVE-NEXT: and w8, w8, #0x4
-; NONEON-NOSVE-NEXT: strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT: and w8, w8, #0x2
-; NONEON-NOSVE-NEXT: strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT: str q0, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT: add w8, w9, w8
-; NONEON-NOSVE-NEXT: add w9, w10, w11
-; NONEON-NOSVE-NEXT: add w10, w12, w13
-; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w9, w10, w14
-; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT: and w9, w9, #0x8
+; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w9
+; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT: and w10, w10, #0x2
+; NONEON-NOSVE-NEXT: and w12, w12, #0x10
+; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT: and w11, w13, #0x20
+; NONEON-NOSVE-NEXT: orr w8, w8, w12
+; NONEON-NOSVE-NEXT: and w12, w14, #0x40
+; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT: orr w8, w10, w8
+; NONEON-NOSVE-NEXT: orr w10, w11, w12
+; NONEON-NOSVE-NEXT: orr w8, w8, w10
+; NONEON-NOSVE-NEXT: and w9, w9, #0xffffff80
; NONEON-NOSVE-NEXT: add w8, w8, w9
-; NONEON-NOSVE-NEXT: add w8, w8, w10
; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17
; NONEON-NOSVE-NEXT: // %bb.1: // %else
; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18
@@ -1097,7 +884,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: .LBB7_15: // %else28
; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32
; NONEON-NOSVE-NEXT: .LBB7_16: // %else30
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.store
; NONEON-NOSVE-NEXT: fmov s0, wzr
@@ -1162,7 +949,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.store29
; NONEON-NOSVE-NEXT: fmov s0, wzr
; NONEON-NOSVE-NEXT: str h0, [x0, #30]
-; NONEON-NOSVE-NEXT: add sp, sp, #48
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
ret void
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
deleted file mode 100644
index 41868a8c790f1..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN: | llvm-objdump --mattr=-neon,+sme -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-bfcvt h5, s3
-// CHECK-INST: bfcvt h5, s3
-// CHECK-ENCODING: [0x65,0x40,0x63,0x1e]
-// CHECK-ERROR: instruction requires: bf16 neon or sme
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
deleted file mode 100644
index 138a1fe0bb8e9..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
+++ /dev/null
@@ -1,132 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-// Scalar FP instructions
-
-fmulx s0, s1, s2
-// CHECK-INST: fmulx s0, s1, s2
-// CHECK-ENCODING: [0x20,0xdc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-fmulx d0, d1, d2
-// CHECK-INST: fmulx d0, d1, d2
-// CHECK-ENCODING: [0x20,0xdc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps s0, s1, s2
-// CHECK-INST: frecps s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps d0, d1, d2
-// CHECK-INST: frecps d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts s0, s1, s2
-// CHECK-INST: frsqrts s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0xa2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts d0, d1, d2
-// CHECK-INST: frsqrts d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0xe2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe s0, s1
-// CHECK-INST: frecpe s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe d0, d1
-// CHECK-INST: frecpe d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx s0, s1
-// CHECK-INST: frecpx s0, s1
-// CHECK-ENCODING: [0x20,0xf8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx d0, d1
-// CHECK-INST: frecpx d0, d1
-// CHECK-ENCODING: [0x20,0xf8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte s0, s1
-// CHECK-INST: frsqrte s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte d0, d1
-// CHECK-INST: frsqrte d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-// Vector to GPR integer move instructions
-
-smov w0, v0.b[0]
-// CHECK-INST: smov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.b[0]
-// CHECK-INST: smov x0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov w0, v0.h[0]
-// CHECK-INST: smov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.h[0]
-// CHECK-INST: smov x0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.s[0]
-// CHECK-INST: smov x0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x2c,0x04,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.b[0]
-// CHECK-INST: umov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x3c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.h[0]
-// CHECK-INST: umov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x3c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-// Aliases
-
-mov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-mov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon