[llvm] [AArch64] Let patterns for NEON instructions check runtime mode. (PR #95560)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 17 07:00:31 PDT 2024


https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/95560

>From 3172366ecbcbd5236a9815702af91cf2752d5d0a Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 14 Jun 2024 16:14:23 +0100
Subject: [PATCH 1/3] [AArch64] Let patterns for NEON instructions check
 runtime mode.

This helps identify any failures where the compiler might otherwise
silently emit instructions that are not valid for the given runtime
mode. We can probably do a similar thing for HasSVE predicates.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   3 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  17 +-
 ...streaming-mode-fixed-length-masked-load.ll | 422 ++++---------
 ...treaming-mode-fixed-length-masked-store.ll | 561 ++++++------------
 .../MC/AArch64/SME/streaming-mode-neon-bf16.s |  16 -
 .../test/MC/AArch64/SME/streaming-mode-neon.s | 132 -----
 6 files changed, 289 insertions(+), 862 deletions(-)
 delete mode 100644 llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
 delete mode 100644 llvm/test/MC/AArch64/SME/streaming-mode-neon.s

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac6f1e07c4184..7cbae90ef3ca4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22185,7 +22185,8 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
 
   SmallVector<SDValue, 16> MaskConstants;
-  if (VecVT == MVT::v16i8) {
+  if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
+      VecVT == MVT::v16i8) {
     // v16i8 is a special case, as we have 16 entries but only 8 positional bits
     // per entry. We split it into two halves, apply the mask, zip the halves to
     // create 8x 16-bit values, and the perform the vector reduce.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index dd54520c8ddad..b183498bd3bd0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -107,7 +107,7 @@ def HasRCPC_IMMO      : Predicate<"Subtarget->hasRCPC_IMMO()">,
 
 def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
                                AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
-def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+def HasNEON          : Predicate<"Subtarget->isNeonAvailable()">,
                                  AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;
 def HasSM4           : Predicate<"Subtarget->hasSM4()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">;
@@ -234,12 +234,9 @@ def HasSMEF16F16orSMEF8F16
                 AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
                 "sme-f16f16 or sme-f8f16">;
 
-// A subset of NEON instructions are legal in Streaming SVE execution mode,
-// they should be enabled if either has been specified.
-def HasNEONorSME
-    : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">,
-                AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME),
-                "neon or sme">;
+// A subset of NEON instructions are legal in Streaming SVE execution mode.
+def HasNEONorSME     : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
                                  AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
 def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -323,8 +320,6 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
 
 def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
 
-def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">;
-
 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                   SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                        SDTCisInt<1>]>>;
@@ -5934,7 +5929,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEON] in {
+let Predicates = [HasNEONorSME] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -8297,7 +8292,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
 
 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
-let Predicates = [IsNeonAvailable] in {
+let Predicates = [HasNEONorSME] in {
   class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
                           SDPatternOperator ExtLoad, Instruction LD1>
     : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index be335c697707d..029f5d0b8a12a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -315,92 +315,40 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-LABEL: masked_load_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    sub sp, sp, #992
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1008
 ; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #979]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #977]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #980]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #981]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #982]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #976]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1012]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1014]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1016]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1018]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1020]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w12, w13
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w10, w14
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1022]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #983]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add w8, w8, w9
 ; NONEON-NOSVE-NEXT:    add x9, sp, #720
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB2_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
@@ -481,7 +429,7 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:  .LBB2_19: // %else44
-; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    add sp, sp, #992
 ; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load4
@@ -806,166 +754,62 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; NONEON-NOSVE-LABEL: masked_load_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sub sp, sp, #2064
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 2080
+; NONEON-NOSVE-NEXT:    sub sp, sp, #2000
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 2016
 ; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2216]
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2152]
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2272]
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2176]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2160]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2024]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2264]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2016]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2256]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2031]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2248]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2030]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2240]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2029]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2232]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2028]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2224]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2027]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2208]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2026]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2200]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2025]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2192]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2023]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2184]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #2022]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2096]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2104]
+; NONEON-NOSVE-NEXT:    sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2088]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2120]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2168]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
 ; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #2021]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2020]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2019]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2088]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2018]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2136]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2017]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2144]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2008]
-; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2104]
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2000]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2080]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2016]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2128]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2015]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2120]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2014]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2112]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #2136]
 ; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #2013]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2096]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #2012]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2011]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2010]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2009]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2007]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2006]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2005]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2004]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #2144]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #2016]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2128]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w11, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w11, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x8
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
 ; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2003]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2002]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2001]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #2048]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2000]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2050]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2048]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2052]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2054]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2056]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2058]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2060]
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w12, w13
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    str q0, [sp, #2032]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2034]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2032]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2036]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #2038]
-; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #2040]
-; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #2042]
-; NONEON-NOSVE-NEXT:    add w10, w12, w11
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2044]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2062]
-; NONEON-NOSVE-NEXT:    add w13, w13, w14
-; NONEON-NOSVE-NEXT:    add w14, w15, w16
-; NONEON-NOSVE-NEXT:    add w10, w10, w13
-; NONEON-NOSVE-NEXT:    add w11, w14, w11
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2046]
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w8, w12
-; NONEON-NOSVE-NEXT:    add w8, w9, w13
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x4
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x20
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w12, w13
+; NONEON-NOSVE-NEXT:    and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT:    add w11, w8, w11
+; NONEON-NOSVE-NEXT:    add w8, w9, w10
 ; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_0
-; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    bfi w8, w11, #16, #16
 ; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_0]
 ; NONEON-NOSVE-NEXT:    add x9, sp, #1744
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB3_2
@@ -1082,7 +926,7 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
 ; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:  .LBB3_35: // %else92
-; NONEON-NOSVE-NEXT:    add sp, sp, #2064
+; NONEON-NOSVE-NEXT:    add sp, sp, #2000
 ; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load4
@@ -1995,94 +1839,42 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-LABEL: masked_load_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    sub sp, sp, #992
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1008
 ; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
 ; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #979]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #977]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #980]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #981]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #982]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #976]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #983]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
 ; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add x9, sp, #720
-; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1012]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1014]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1016]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1018]
-; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #1020]
-; NONEON-NOSVE-NEXT:    add w8, w10, w8
-; NONEON-NOSVE-NEXT:    add w10, w11, w12
-; NONEON-NOSVE-NEXT:    add w11, w13, w14
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
-; NONEON-NOSVE-NEXT:    add w10, w11, w15
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1022]
 ; NONEON-NOSVE-NEXT:    add w8, w8, w10
-; NONEON-NOSVE-NEXT:    add w8, w8, w11
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB7_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -2157,7 +1949,7 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
 ; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:  .LBB7_19: // %else44
-; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    add sp, sp, #992
 ; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load4
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index a79ce9db9abfd..5eba12a048cf9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -172,89 +172,37 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w12, w13
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w10, w14
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
@@ -287,7 +235,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB2_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
 ; NONEON-NOSVE-NEXT:  .LBB2_16: // %else30
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -336,7 +284,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
 ; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
@@ -421,328 +369,219 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #80
-; NONEON-NOSVE-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #176]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #160]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #264]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #248]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #240]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #232]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #224]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #208]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #200]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #192]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #22]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #21]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #136]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #144]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
-; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #104]
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #80]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #120]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #72]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
-; NONEON-NOSVE-NEXT:    zip1 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
 ; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #12]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #120]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #128]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w16, [sp]
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x8
+; NONEON-NOSVE-NEXT:    and w9, w11, #0x20
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
 ; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #52]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #54]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #58]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #38]
-; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #40]
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w12, w13
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #42]
-; NONEON-NOSVE-NEXT:    add w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w10, w12, w11
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w12, w13, w14
-; NONEON-NOSVE-NEXT:    add w14, w15, w16
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
-; NONEON-NOSVE-NEXT:    add w10, w10, w12
-; NONEON-NOSVE-NEXT:    add w11, w14, w11
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #46]
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w8, w13
-; NONEON-NOSVE-NEXT:    add w8, w9, w12
-; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_34
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x4
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT:    bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x20
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w12, w13
+; NONEON-NOSVE-NEXT:    and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT:    add w11, w8, w11
+; NONEON-NOSVE-NEXT:    add w8, w9, w10
+; NONEON-NOSVE-NEXT:    bfi w8, w11, #16, #16
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_35
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
 ; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
 ; NONEON-NOSVE-NEXT:  .LBB3_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
 ; NONEON-NOSVE-NEXT:  .LBB3_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
 ; NONEON-NOSVE-NEXT:  .LBB3_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
 ; NONEON-NOSVE-NEXT:  .LBB3_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
 ; NONEON-NOSVE-NEXT:  .LBB3_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
 ; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
 ; NONEON-NOSVE-NEXT:  .LBB3_9: // %else16
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
 ; NONEON-NOSVE-NEXT:  .LBB3_10: // %else18
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
 ; NONEON-NOSVE-NEXT:  .LBB3_11: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
 ; NONEON-NOSVE-NEXT:  .LBB3_12: // %else22
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
 ; NONEON-NOSVE-NEXT:  .LBB3_13: // %else24
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
 ; NONEON-NOSVE-NEXT:  .LBB3_14: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
 ; NONEON-NOSVE-NEXT:  .LBB3_15: // %else28
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
 ; NONEON-NOSVE-NEXT:  .LBB3_16: // %else30
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
 ; NONEON-NOSVE-NEXT:  .LBB3_17: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
 ; NONEON-NOSVE-NEXT:  .LBB3_18: // %else34
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
 ; NONEON-NOSVE-NEXT:  .LBB3_19: // %else36
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
 ; NONEON-NOSVE-NEXT:  .LBB3_20: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
 ; NONEON-NOSVE-NEXT:  .LBB3_21: // %else40
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
 ; NONEON-NOSVE-NEXT:  .LBB3_22: // %else42
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
 ; NONEON-NOSVE-NEXT:  .LBB3_23: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
 ; NONEON-NOSVE-NEXT:  .LBB3_24: // %else46
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
 ; NONEON-NOSVE-NEXT:  .LBB3_25: // %else48
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
 ; NONEON-NOSVE-NEXT:  .LBB3_26: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
 ; NONEON-NOSVE-NEXT:  .LBB3_27: // %else52
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
 ; NONEON-NOSVE-NEXT:  .LBB3_28: // %else54
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
 ; NONEON-NOSVE-NEXT:  .LBB3_29: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
 ; NONEON-NOSVE-NEXT:  .LBB3_30: // %else58
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
 ; NONEON-NOSVE-NEXT:  .LBB3_31: // %else60
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_33
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %cond.store61
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %else62
-; NONEON-NOSVE-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %else62
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store1
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store1
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store3
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store3
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store5
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store5
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
 ; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store7
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store7
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
 ; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store9
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store9
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
 ; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store11
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store11
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store13
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store13
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
 ; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store15
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store15
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
 ; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store17
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store17
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
 ; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store19
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store19
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
 ; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store21
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store21
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
 ; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store23
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store23
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
 ; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store25
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store25
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
 ; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store27
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store27
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store29
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
 ; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store31
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store31
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store33
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store33
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #17]
 ; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store35
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store35
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #18]
 ; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store37
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store37
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #19]
 ; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store39
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store39
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #20]
 ; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store41
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store41
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #21]
 ; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store43
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store43
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #22]
 ; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store45
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store45
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #23]
 ; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store47
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store47
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #24]
 ; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store49
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store49
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #25]
 ; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store51
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store51
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #26]
 ; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store53
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store53
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #27]
 ; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store55
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store55
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #28]
 ; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store57
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store57
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #29]
 ; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store59
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store59
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #30]
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:    b .LBB3_33
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store61
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
   ret void
 }
@@ -982,89 +821,37 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w12, w13
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w10, w14
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
@@ -1097,7 +884,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
 ; NONEON-NOSVE-NEXT:  .LBB7_16: // %else30
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -1162,7 +949,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #30]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
deleted file mode 100644
index 41868a8c790f1..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN:        | llvm-objdump --mattr=-neon,+sme -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-bfcvt h5, s3
-// CHECK-INST: bfcvt h5, s3
-// CHECK-ENCODING: [0x65,0x40,0x63,0x1e]
-// CHECK-ERROR: instruction requires: bf16 neon or sme
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
deleted file mode 100644
index 138a1fe0bb8e9..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
+++ /dev/null
@@ -1,132 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-// Scalar FP instructions
-
-fmulx s0, s1, s2
-// CHECK-INST: fmulx s0, s1, s2
-// CHECK-ENCODING: [0x20,0xdc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-fmulx d0, d1, d2
-// CHECK-INST: fmulx d0, d1, d2
-// CHECK-ENCODING: [0x20,0xdc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps s0, s1, s2
-// CHECK-INST: frecps s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps d0, d1, d2
-// CHECK-INST: frecps d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts s0, s1, s2
-// CHECK-INST: frsqrts s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0xa2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts d0, d1, d2
-// CHECK-INST: frsqrts d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0xe2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe s0, s1
-// CHECK-INST: frecpe s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe d0, d1
-// CHECK-INST: frecpe d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx s0, s1
-// CHECK-INST: frecpx s0, s1
-// CHECK-ENCODING: [0x20,0xf8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx d0, d1
-// CHECK-INST: frecpx d0, d1
-// CHECK-ENCODING: [0x20,0xf8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte s0, s1
-// CHECK-INST: frsqrte s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte d0, d1
-// CHECK-INST: frsqrte d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-// Vector to GPR integer move instructions
-
-smov w0, v0.b[0]
-// CHECK-INST: smov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.b[0]
-// CHECK-INST: smov x0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov w0, v0.h[0]
-// CHECK-INST: smov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.h[0]
-// CHECK-INST: smov x0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.s[0]
-// CHECK-INST: smov x0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x2c,0x04,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.b[0]
-// CHECK-INST: umov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x3c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.h[0]
-// CHECK-INST: umov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x3c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-// Aliases
-
-mov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-mov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon

>From 0529b2be8ad2673ededb09899785006a8c1038d7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Sun, 16 Jun 2024 11:27:26 +0100
Subject: [PATCH 2/3] Rename HasNEONorSME to HasNEONAndIsStreamingSafe

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 28 +++++++++++----------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b183498bd3bd0..0ac619b9edc4d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -234,9 +234,11 @@ def HasSMEF16F16orSMEF8F16
                 AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
                 "sme-f16f16 or sme-f8f16">;
 
-// A subset of NEON instructions are legal in Streaming SVE execution mode.
-def HasNEONorSME     : Predicate<"Subtarget->hasNEON()">,
-                                 AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
+// A subset of NEON instructions are legal in Streaming SVE execution mode,
+// so don't need the additional check for 'isStreamingAvailable'.
+def HasNEONAndIsStreamingSafe
+    : Predicate<"Subtarget->hasNEON()">,
+      AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
                                  AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
 def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -1321,7 +1323,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
                              VectorIndexS:$idx)>;
 }
 
-let Predicates = [HasNEONorSME, HasBF16] in {
+let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in {
 def BFCVT : BF16ToSinglePrecision<"bfcvt">;
 // Round FP32 to BF16.
 def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
@@ -5702,9 +5704,9 @@ defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
 defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
 defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
 defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>;
-defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>;
-defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>;
+defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONAndIsStreamingSafe>;
+defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONAndIsStreamingSafe>;
+defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONAndIsStreamingSafe>;
 defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
 defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -5733,7 +5735,7 @@ let Predicates = [HasRDM] in {
 
 defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
                                           int_aarch64_neon_fmulx,
-                                          [HasNEONorSME]>;
+                                          [HasNEONAndIsStreamingSafe]>;
 
 let Predicates = [HasNEON] in {
 def : InstAlias<"cmls $dst, $src1, $src2",
@@ -5807,9 +5809,9 @@ defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu">;
 def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
 defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">;
 defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONorSME>;
-defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONorSME>;
-defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONorSME>;
+defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONAndIsStreamingSafe>;
+defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONAndIsStreamingSafe>;
+defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONAndIsStreamingSafe>;
 defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
                                  UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
 defm SCVTF  : SIMDFPTwoScalarCVT<   0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5828,7 +5830,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
           (CMLTv1i64rz V64:$Rn)>;
 
 // Round FP64 to BF16.
-let Predicates = [HasNEONorSME, HasBF16] in
+let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in
 def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
           (BFCVT (FCVTXNv1i64 $Rn))>;
 
@@ -5929,7 +5931,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONorSME] in {
+let Predicates = [HasNEONAndIsStreamingSafe] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),

>From bd914dbb0ffe1888afc75403741cfd62b3b49097 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 17 Jun 2024 14:58:41 +0100
Subject: [PATCH 3/3] Fixed missing cases causing build failures, also
 decapitalised 'and' in HasNEONandIsStreamingSafe

---
 .../lib/Target/AArch64/AArch64InstrFormats.td |  4 +--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 26 +++++++++----------
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  8 +++---
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1f437d0ed6f8d..75b3ded8a5f3d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7880,7 +7880,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst,
 multiclass SMov {
   // SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
   // streaming mode.
-  let Predicates = [HasNEONorSME] in {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
     def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> {
       let Inst{20-16} = 0b00001;
     }
@@ -7927,7 +7927,7 @@ multiclass SMov {
 multiclass UMov {
   // UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
   // streaming mode.
-  let Predicates = [HasNEONorSME] in {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
     def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> {
       let Inst{20-16} = 0b00001;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 0ac619b9edc4d..f720b1f1338be 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -235,8 +235,8 @@ def HasSMEF16F16orSMEF8F16
                 "sme-f16f16 or sme-f8f16">;
 
 // A subset of NEON instructions are legal in Streaming SVE execution mode,
-// so don't need the additional check for 'isStreamingAvailable'.
-def HasNEONAndIsStreamingSafe
+// so don't need the additional check for 'isNeonAvailable'.
+def HasNEONandIsStreamingSafe
     : Predicate<"Subtarget->hasNEON()">,
       AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
@@ -1323,7 +1323,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
                              VectorIndexS:$idx)>;
 }
 
-let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in {
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
 def BFCVT : BF16ToSinglePrecision<"bfcvt">;
 // Round FP32 to BF16.
 def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
@@ -5704,9 +5704,9 @@ defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
 defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
 defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
 defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONAndIsStreamingSafe>;
-defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONAndIsStreamingSafe>;
-defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONAndIsStreamingSafe>;
+defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>;
+defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
+defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
 defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
 defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -5735,7 +5735,7 @@ let Predicates = [HasRDM] in {
 
 defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
                                           int_aarch64_neon_fmulx,
-                                          [HasNEONAndIsStreamingSafe]>;
+                                          [HasNEONandIsStreamingSafe]>;
 
 let Predicates = [HasNEON] in {
 def : InstAlias<"cmls $dst, $src1, $src2",
@@ -5809,9 +5809,9 @@ defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu">;
 def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
 defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">;
 defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONAndIsStreamingSafe>;
-defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONAndIsStreamingSafe>;
-defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONAndIsStreamingSafe>;
+defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONandIsStreamingSafe>;
+defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONandIsStreamingSafe>;
+defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONandIsStreamingSafe>;
 defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
                                  UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
 defm SCVTF  : SIMDFPTwoScalarCVT<   0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5830,7 +5830,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
           (CMLTv1i64rz V64:$Rn)>;
 
 // Round FP64 to BF16.
-let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in
 def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
           (BFCVT (FCVTXNv1i64 $Rn))>;
 
@@ -5931,7 +5931,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONAndIsStreamingSafe] in {
+let Predicates = [HasNEONandIsStreamingSafe] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -8294,7 +8294,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
 
 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
-let Predicates = [HasNeonOrSME] in {
+let Predicates = [HasNEONandIsStreamingSafe] in {
   class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
                           SDPatternOperator ExtLoad, Instruction LD1>
     : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bd5de628d8529..234bfa40a1296 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3351,7 +3351,7 @@ let Predicates = [HasSVEorSME] in {
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
 
   // Extract element from vector with immediate index that's within the bottom 128-bits.
-  let Predicates = [IsNeonAvailable], AddedComplexity = 1 in {
+  let Predicates = [HasNEONandIsStreamingSafe], AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)),
             (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)),
@@ -3360,9 +3360,9 @@ let Predicates = [HasSVEorSME] in {
             (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)),
             (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
-  } // End IsNeonAvailable
+  } // End HasNEONandIsStreamingSafe
 
-  let Predicates = [IsNeonAvailable] in {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
   def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8),
             (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index))), i8),
@@ -3375,7 +3375,7 @@ let Predicates = [HasSVEorSME] in {
 
   def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
             (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
-  } // End IsNeonAvailable
+  } // End HasNEONandIsStreamingSafe
 
   // Extract first element from vector.
   let AddedComplexity = 2 in {



More information about the llvm-commits mailing list