[llvm] [AArch64] Let patterns for NEON instructions check runtime mode. (PR #95560)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 19 05:34:23 PDT 2024


https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/95560

>From 4a33a6f82390dcf92d6cd030ed24774879c4dac2 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 14 Jun 2024 16:14:23 +0100
Subject: [PATCH 1/7] [AArch64] Let patterns for NEON instructions check
 runtime mode.

This helps identify any failures where the compiler might otherwise
silently emit instructions that are not valid for the given runtime
mode. We can probably do a similar thing for HasSVE predicates.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   3 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  17 +-
 ...streaming-mode-fixed-length-masked-load.ll | 422 ++++---------
 ...treaming-mode-fixed-length-masked-store.ll | 561 ++++++------------
 .../MC/AArch64/SME/streaming-mode-neon-bf16.s |  16 -
 .../test/MC/AArch64/SME/streaming-mode-neon.s | 132 -----
 6 files changed, 289 insertions(+), 862 deletions(-)
 delete mode 100644 llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
 delete mode 100644 llvm/test/MC/AArch64/SME/streaming-mode-neon.s

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c5c3ef02115ec..69b1b15afb472 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22271,7 +22271,8 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
 
   SmallVector<SDValue, 16> MaskConstants;
-  if (VecVT == MVT::v16i8) {
+  if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
+      VecVT == MVT::v16i8) {
     // v16i8 is a special case, as we have 16 entries but only 8 positional bits
     // per entry. We split it into two halves, apply the mask, zip the halves to
     // create 8x 16-bit values, and the perform the vector reduce.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 91e5bc3caa102..aa33ef7b40e47 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -107,7 +107,7 @@ def HasRCPC_IMMO      : Predicate<"Subtarget->hasRCPC_IMMO()">,
 
 def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
                                AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
-def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+def HasNEON          : Predicate<"Subtarget->isNeonAvailable()">,
                                  AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;
 def HasSM4           : Predicate<"Subtarget->hasSM4()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">;
@@ -234,12 +234,9 @@ def HasSMEF16F16orSMEF8F16
                 AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
                 "sme-f16f16 or sme-f8f16">;
 
-// A subset of NEON instructions are legal in Streaming SVE execution mode,
-// they should be enabled if either has been specified.
-def HasNEONorSME
-    : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">,
-                AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME),
-                "neon or sme">;
+// A subset of NEON instructions are legal in Streaming SVE execution mode.
+def HasNEONorSME     : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
                                  AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
 def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -323,8 +320,6 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
 
 def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
 
-def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">;
-
 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                   SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                        SDTCisInt<1>]>>;
@@ -6016,7 +6011,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEON] in {
+let Predicates = [HasNEONorSME] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -8379,7 +8374,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
 
 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
-let Predicates = [IsNeonAvailable] in {
+let Predicates = [HasNEON] in {
   class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
                           SDPatternOperator ExtLoad, Instruction LD1>
     : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index a689a539b0082..5f4b9dd1592cf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -315,92 +315,40 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-LABEL: masked_load_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    sub sp, sp, #992
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1008
 ; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #979]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #977]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #980]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #981]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #982]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #976]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1012]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1014]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1016]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1018]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1020]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w12, w13
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w10, w14
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1022]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #983]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add w8, w8, w9
 ; NONEON-NOSVE-NEXT:    add x9, sp, #720
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB2_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
@@ -481,7 +429,7 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:  .LBB2_19: // %else44
-; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    add sp, sp, #992
 ; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load4
@@ -806,166 +754,62 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; NONEON-NOSVE-LABEL: masked_load_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sub sp, sp, #2064
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 2080
+; NONEON-NOSVE-NEXT:    sub sp, sp, #2000
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 2016
 ; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2216]
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2152]
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2272]
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2176]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2160]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2024]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2264]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2016]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2256]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2031]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2248]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2030]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2240]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2029]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2232]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2028]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2224]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2027]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2208]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2026]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2200]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2025]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2192]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2023]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2184]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #2022]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2096]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2104]
+; NONEON-NOSVE-NEXT:    sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2088]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2120]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2168]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
 ; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #2021]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2020]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2019]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2088]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2018]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2136]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2017]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2144]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2008]
-; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2104]
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2000]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2080]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2016]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2128]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2015]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2120]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2014]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2112]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #2136]
 ; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #2013]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2096]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #2012]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2011]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2010]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2009]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2007]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2006]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2005]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2004]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #2144]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #2016]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2128]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w11, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w11, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x8
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
 ; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2003]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2002]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2001]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #2048]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2000]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2050]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2048]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2052]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2054]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2056]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2058]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2060]
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w12, w13
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    str q0, [sp, #2032]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2034]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2032]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2036]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #2038]
-; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #2040]
-; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #2042]
-; NONEON-NOSVE-NEXT:    add w10, w12, w11
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2044]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2062]
-; NONEON-NOSVE-NEXT:    add w13, w13, w14
-; NONEON-NOSVE-NEXT:    add w14, w15, w16
-; NONEON-NOSVE-NEXT:    add w10, w10, w13
-; NONEON-NOSVE-NEXT:    add w11, w14, w11
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2046]
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w8, w12
-; NONEON-NOSVE-NEXT:    add w8, w9, w13
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x4
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x20
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w12, w13
+; NONEON-NOSVE-NEXT:    and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT:    add w11, w8, w11
+; NONEON-NOSVE-NEXT:    add w8, w9, w10
 ; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_0
-; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    bfi w8, w11, #16, #16
 ; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_0]
 ; NONEON-NOSVE-NEXT:    add x9, sp, #1744
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB3_2
@@ -1083,7 +927,7 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
 ; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:  .LBB3_35: // %else92
-; NONEON-NOSVE-NEXT:    add sp, sp, #2064
+; NONEON-NOSVE-NEXT:    add sp, sp, #2000
 ; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load4
@@ -1996,94 +1840,42 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-LABEL: masked_load_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    sub sp, sp, #992
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1008
 ; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
 ; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #979]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #977]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #980]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #981]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #982]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #976]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #983]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
 ; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add x9, sp, #720
-; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1008]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1012]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1014]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1016]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1018]
-; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #1020]
-; NONEON-NOSVE-NEXT:    add w8, w10, w8
-; NONEON-NOSVE-NEXT:    add w10, w11, w12
-; NONEON-NOSVE-NEXT:    add w11, w13, w14
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
-; NONEON-NOSVE-NEXT:    add w10, w11, w15
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1022]
 ; NONEON-NOSVE-NEXT:    add w8, w8, w10
-; NONEON-NOSVE-NEXT:    add w8, w8, w11
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB7_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -2159,7 +1951,7 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
 ; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:  .LBB7_19: // %else44
-; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    add sp, sp, #992
 ; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load4
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 13b83d2ae3f07..0c3411e5f5514 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -172,89 +172,37 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w12, w13
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w10, w14
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
@@ -287,7 +235,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB2_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
 ; NONEON-NOSVE-NEXT:  .LBB2_16: // %else30
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -336,7 +284,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
 ; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
@@ -421,328 +369,219 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #80
-; NONEON-NOSVE-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #176]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #160]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #264]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #248]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #240]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #232]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #224]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #208]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #200]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #192]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #22]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #21]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #136]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #144]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
-; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #104]
-; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp]
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #80]
-; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #120]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
-; NONEON-NOSVE-NEXT:    strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w15, w7, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #72]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
-; NONEON-NOSVE-NEXT:    zip1 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
 ; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
-; NONEON-NOSVE-NEXT:    strb w10, [sp, #12]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
-; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
-; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
-; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
-; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
-; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
-; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
-; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #120]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #128]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w16, [sp]
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w12, w4, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    sbfx w10, w13, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    sbfx w13, w5, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x8
+; NONEON-NOSVE-NEXT:    and w9, w11, #0x20
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w11, w3, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
 ; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
-; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
-; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #52]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #54]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #58]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #38]
-; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #40]
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w12, w13
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #42]
-; NONEON-NOSVE-NEXT:    add w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w10, w12, w11
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w12, w13, w14
-; NONEON-NOSVE-NEXT:    add w14, w15, w16
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
-; NONEON-NOSVE-NEXT:    add w10, w10, w12
-; NONEON-NOSVE-NEXT:    add w11, w14, w11
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #46]
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w8, w13
-; NONEON-NOSVE-NEXT:    add w8, w9, w12
-; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_34
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x4
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w13, w6, #0, #1
+; NONEON-NOSVE-NEXT:    bfxil w10, w1, #0, #1
+; NONEON-NOSVE-NEXT:    orr w11, w11, w12
+; NONEON-NOSVE-NEXT:    and w12, w13, #0x20
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w15, w16, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w12, w13
+; NONEON-NOSVE-NEXT:    and w11, w14, #0xff80
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w10, w15, #0xff80
+; NONEON-NOSVE-NEXT:    add w11, w8, w11
+; NONEON-NOSVE-NEXT:    add w8, w9, w10
+; NONEON-NOSVE-NEXT:    bfi w8, w11, #16, #16
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_35
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
 ; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
 ; NONEON-NOSVE-NEXT:  .LBB3_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
 ; NONEON-NOSVE-NEXT:  .LBB3_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
 ; NONEON-NOSVE-NEXT:  .LBB3_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
 ; NONEON-NOSVE-NEXT:  .LBB3_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
 ; NONEON-NOSVE-NEXT:  .LBB3_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
 ; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
 ; NONEON-NOSVE-NEXT:  .LBB3_9: // %else16
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
 ; NONEON-NOSVE-NEXT:  .LBB3_10: // %else18
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
 ; NONEON-NOSVE-NEXT:  .LBB3_11: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
 ; NONEON-NOSVE-NEXT:  .LBB3_12: // %else22
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
 ; NONEON-NOSVE-NEXT:  .LBB3_13: // %else24
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
 ; NONEON-NOSVE-NEXT:  .LBB3_14: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
 ; NONEON-NOSVE-NEXT:  .LBB3_15: // %else28
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
 ; NONEON-NOSVE-NEXT:  .LBB3_16: // %else30
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
 ; NONEON-NOSVE-NEXT:  .LBB3_17: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
 ; NONEON-NOSVE-NEXT:  .LBB3_18: // %else34
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
 ; NONEON-NOSVE-NEXT:  .LBB3_19: // %else36
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
 ; NONEON-NOSVE-NEXT:  .LBB3_20: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
 ; NONEON-NOSVE-NEXT:  .LBB3_21: // %else40
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
 ; NONEON-NOSVE-NEXT:  .LBB3_22: // %else42
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
 ; NONEON-NOSVE-NEXT:  .LBB3_23: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
 ; NONEON-NOSVE-NEXT:  .LBB3_24: // %else46
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
 ; NONEON-NOSVE-NEXT:  .LBB3_25: // %else48
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
 ; NONEON-NOSVE-NEXT:  .LBB3_26: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
 ; NONEON-NOSVE-NEXT:  .LBB3_27: // %else52
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
 ; NONEON-NOSVE-NEXT:  .LBB3_28: // %else54
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
 ; NONEON-NOSVE-NEXT:  .LBB3_29: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
 ; NONEON-NOSVE-NEXT:  .LBB3_30: // %else58
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
 ; NONEON-NOSVE-NEXT:  .LBB3_31: // %else60
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_33
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %cond.store61
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %else62
-; NONEON-NOSVE-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %else62
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store1
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store1
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store3
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store3
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store5
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store5
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
 ; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store7
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store7
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
 ; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store9
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store9
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
 ; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store11
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store11
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store13
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store13
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
 ; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store15
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store15
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
 ; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store17
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store17
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
 ; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store19
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store19
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
 ; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store21
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store21
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
 ; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store23
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store23
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
 ; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store25
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store25
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
 ; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store27
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store27
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store29
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
 ; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store31
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store31
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store33
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store33
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #17]
 ; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store35
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store35
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #18]
 ; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store37
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store37
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #19]
 ; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store39
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store39
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #20]
 ; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store41
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store41
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #21]
 ; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store43
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store43
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #22]
 ; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store45
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store45
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #23]
 ; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store47
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store47
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #24]
 ; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store49
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store49
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #25]
 ; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store51
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store51
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #26]
 ; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store53
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store53
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #27]
 ; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store55
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store55
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #28]
 ; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store57
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store57
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #29]
 ; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store59
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store59
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #30]
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:    b .LBB3_33
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store61
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
   ret void
 }
@@ -981,89 +820,37 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
 ; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
 ; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
-; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
-; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
-; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
-; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
-; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
-; NONEON-NOSVE-NEXT:    add w8, w9, w8
-; NONEON-NOSVE-NEXT:    add w9, w10, w11
-; NONEON-NOSVE-NEXT:    add w10, w12, w13
-; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w9, w10, w14
-; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xffffff80
 ; NONEON-NOSVE-NEXT:    add w8, w8, w9
-; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
@@ -1096,7 +883,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
 ; NONEON-NOSVE-NEXT:  .LBB7_16: // %else30
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -1161,7 +948,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #30]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
deleted file mode 100644
index 41868a8c790f1..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon-bf16.s
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN:        | llvm-objdump --mattr=-neon,+sme -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-bfcvt h5, s3
-// CHECK-INST: bfcvt h5, s3
-// CHECK-ENCODING: [0x65,0x40,0x63,0x1e]
-// CHECK-ERROR: instruction requires: bf16 neon or sme
diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
deleted file mode 100644
index 138a1fe0bb8e9..0000000000000
--- a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s
+++ /dev/null
@@ -1,132 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=-neon < %s 2>&1 \
-// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=-neon,+sme < %s \
-// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-INST
-// Disassemble encoding and check the re-encoding (-show-encoding) matches.
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=-neon,+sme < %s \
-// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=-neon,+sme -disassemble -show-encoding \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-
-// Scalar FP instructions
-
-fmulx s0, s1, s2
-// CHECK-INST: fmulx s0, s1, s2
-// CHECK-ENCODING: [0x20,0xdc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-fmulx d0, d1, d2
-// CHECK-INST: fmulx d0, d1, d2
-// CHECK-ENCODING: [0x20,0xdc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps s0, s1, s2
-// CHECK-INST: frecps s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0x22,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecps d0, d1, d2
-// CHECK-INST: frecps d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0x62,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts s0, s1, s2
-// CHECK-INST: frsqrts s0, s1, s2
-// CHECK-ENCODING: [0x20,0xfc,0xa2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrts d0, d1, d2
-// CHECK-INST: frsqrts d0, d1, d2
-// CHECK-ENCODING: [0x20,0xfc,0xe2,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe s0, s1
-// CHECK-INST: frecpe s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpe d0, d1
-// CHECK-INST: frecpe d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx s0, s1
-// CHECK-INST: frecpx s0, s1
-// CHECK-ENCODING: [0x20,0xf8,0xa1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frecpx d0, d1
-// CHECK-INST: frecpx d0, d1
-// CHECK-ENCODING: [0x20,0xf8,0xe1,0x5e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte s0, s1
-// CHECK-INST: frsqrte s0, s1
-// CHECK-ENCODING: [0x20,0xd8,0xa1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-frsqrte d0, d1
-// CHECK-INST: frsqrte d0, d1
-// CHECK-ENCODING: [0x20,0xd8,0xe1,0x7e]
-// CHECK-ERROR: instruction requires: neon or sme
-
-// Vector to GPR integer move instructions
-
-smov w0, v0.b[0]
-// CHECK-INST: smov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.b[0]
-// CHECK-INST: smov x0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x2c,0x01,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov w0, v0.h[0]
-// CHECK-INST: smov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.h[0]
-// CHECK-INST: smov x0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x2c,0x02,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-smov x0, v0.s[0]
-// CHECK-INST: smov x0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x2c,0x04,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.b[0]
-// CHECK-INST: umov w0, v0.b[0]
-// CHECK-ENCODING: [0x00,0x3c,0x01,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.h[0]
-// CHECK-INST: umov w0, v0.h[0]
-// CHECK-ENCODING: [0x00,0x3c,0x02,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-umov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon
-
-// Aliases
-
-mov w0, v0.s[0]
-// CHECK-INST: mov w0, v0.s[0]
-// CHECK-ENCODING: [0x00,0x3c,0x04,0x0e]
-// CHECK-ERROR: instruction requires: neon
-
-mov x0, v0.d[0]
-// CHECK-INST: mov x0, v0.d[0]
-// CHECK-ENCODING: [0x00,0x3c,0x08,0x4e]
-// CHECK-ERROR: instruction requires: neon

>From f48cb9cdbdec8e6ce4292b3e21bce959b4b89a07 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Sun, 16 Jun 2024 11:27:26 +0100
Subject: [PATCH 2/7] Rename HasNEONorSME to HasNEONAndIsStreamingSafe

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 28 +++++++++++----------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index aa33ef7b40e47..a6e66808bc7ae 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -234,9 +234,11 @@ def HasSMEF16F16orSMEF8F16
                 AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
                 "sme-f16f16 or sme-f8f16">;
 
-// A subset of NEON instructions are legal in Streaming SVE execution mode.
-def HasNEONorSME     : Predicate<"Subtarget->hasNEON()">,
-                                 AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
+// A subset of NEON instructions are legal in Streaming SVE execution mode,
+// so don't need the additional check for 'isStreamingAvailable'.
+def HasNEONAndIsStreamingSafe
+    : Predicate<"Subtarget->hasNEON()">,
+      AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
                                  AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
 def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -1345,7 +1347,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
                              VectorIndexS:$idx)>;
 }
 
-let Predicates = [HasNEONorSME, HasBF16] in {
+let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in {
 def BFCVT : BF16ToSinglePrecision<"bfcvt">;
 // Round FP32 to BF16.
 def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
@@ -5784,9 +5786,9 @@ defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
 defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
 defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
 defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>;
-defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>;
-defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>;
+defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONAndIsStreamingSafe>;
+defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONAndIsStreamingSafe>;
+defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONAndIsStreamingSafe>;
 defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
 defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -5815,7 +5817,7 @@ let Predicates = [HasRDM] in {
 
 defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
                                           int_aarch64_neon_fmulx,
-                                          [HasNEONorSME]>;
+                                          [HasNEONAndIsStreamingSafe]>;
 
 let Predicates = [HasNEON] in {
 def : InstAlias<"cmls $dst, $src1, $src2",
@@ -5889,9 +5891,9 @@ defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu">;
 def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
 defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">;
 defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONorSME>;
-defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONorSME>;
-defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONorSME>;
+defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONAndIsStreamingSafe>;
+defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONAndIsStreamingSafe>;
+defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONAndIsStreamingSafe>;
 defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
                                  UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
 defm SCVTF  : SIMDFPTwoScalarCVT<   0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5910,7 +5912,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
           (CMLTv1i64rz V64:$Rn)>;
 
 // Round FP64 to BF16.
-let Predicates = [HasNEONorSME, HasBF16] in
+let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in
 def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
           (BFCVT (FCVTXNv1i64 $Rn))>;
 
@@ -6011,7 +6013,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONorSME] in {
+let Predicates = [HasNEONAndIsStreamingSafe] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),

>From 1a85524926b832d0196c9abebc3efa6136eaaa64 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 17 Jun 2024 14:58:41 +0100
Subject: [PATCH 3/7] Fixed missing cases causing build failures, also
 decapitalised 'and' in HasNEONandIsStreamingSafe

---
 .../lib/Target/AArch64/AArch64InstrFormats.td |  4 +--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 26 +++++++++----------
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  8 +++---
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 17d011086634c..608698954df42 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7880,7 +7880,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst,
 multiclass SMov {
   // SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
   // streaming mode.
-  let Predicates = [HasNEONorSME] in {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
     def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> {
       let Inst{20-16} = 0b00001;
     }
@@ -7927,7 +7927,7 @@ multiclass SMov {
 multiclass UMov {
   // UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
   // streaming mode.
-  let Predicates = [HasNEONorSME] in {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
     def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> {
       let Inst{20-16} = 0b00001;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a6e66808bc7ae..a69f6663babe1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -235,8 +235,8 @@ def HasSMEF16F16orSMEF8F16
                 "sme-f16f16 or sme-f8f16">;
 
 // A subset of NEON instructions are legal in Streaming SVE execution mode,
-// so don't need the additional check for 'isStreamingAvailable'.
-def HasNEONAndIsStreamingSafe
+// so don't need the additional check for 'isNeonAvailable'.
+def HasNEONandIsStreamingSafe
     : Predicate<"Subtarget->hasNEON()">,
       AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
@@ -1347,7 +1347,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
                              VectorIndexS:$idx)>;
 }
 
-let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in {
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
 def BFCVT : BF16ToSinglePrecision<"bfcvt">;
 // Round FP32 to BF16.
 def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
@@ -5786,9 +5786,9 @@ defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
 defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
 defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
 defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONAndIsStreamingSafe>;
-defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONAndIsStreamingSafe>;
-defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONAndIsStreamingSafe>;
+defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>;
+defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
+defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
 defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
 defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -5817,7 +5817,7 @@ let Predicates = [HasRDM] in {
 
 defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
                                           int_aarch64_neon_fmulx,
-                                          [HasNEONAndIsStreamingSafe]>;
+                                          [HasNEONandIsStreamingSafe]>;
 
 let Predicates = [HasNEON] in {
 def : InstAlias<"cmls $dst, $src1, $src2",
@@ -5891,9 +5891,9 @@ defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu">;
 def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
 defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">;
 defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONAndIsStreamingSafe>;
-defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONAndIsStreamingSafe>;
-defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONAndIsStreamingSafe>;
+defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONandIsStreamingSafe>;
+defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONandIsStreamingSafe>;
+defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONandIsStreamingSafe>;
 defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
                                  UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
 defm SCVTF  : SIMDFPTwoScalarCVT<   0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5912,7 +5912,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
           (CMLTv1i64rz V64:$Rn)>;
 
 // Round FP64 to BF16.
-let Predicates = [HasNEONAndIsStreamingSafe, HasBF16] in
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in
 def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
           (BFCVT (FCVTXNv1i64 $Rn))>;
 
@@ -6013,7 +6013,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONAndIsStreamingSafe] in {
+let Predicates = [HasNEONandIsStreamingSafe] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -8376,7 +8376,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
 
 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
-let Predicates = [HasNeonOrSME] in {
+let Predicates = [HasNEONandIsStreamingSafe] in {
   class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
                           SDPatternOperator ExtLoad, Instruction LD1>
     : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bd5de628d8529..234bfa40a1296 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3351,7 +3351,7 @@ let Predicates = [HasSVEorSME] in {
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
 
   // Extract element from vector with immediate index that's within the bottom 128-bits.
-  let Predicates = [IsNeonAvailable], AddedComplexity = 1 in {
+  let Predicates = [HasNEONandIsStreamingSafe], AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)),
             (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)),
@@ -3360,9 +3360,9 @@ let Predicates = [HasSVEorSME] in {
             (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)),
             (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
-  } // End IsNeonAvailable
+  } // End HasNEONandIsStreamingSafe
 
-  let Predicates = [IsNeonAvailable] in {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
   def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8),
             (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index))), i8),
@@ -3375,7 +3375,7 @@ let Predicates = [HasSVEorSME] in {
 
   def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
             (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
-  } // End IsNeonAvailable
+  } // End HasNEONandIsStreamingSafe
 
   // Extract first element from vector.
   let AddedComplexity = 2 in {

>From 97f2a88524ea7edd5dcb1da006c3c9f8d83d4af9 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 17 Jun 2024 16:13:37 +0100
Subject: [PATCH 4/7] Used wrong predicate in previous patch

---
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 234bfa40a1296..a3c41f2e052cd 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3351,7 +3351,7 @@ let Predicates = [HasSVEorSME] in {
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
 
   // Extract element from vector with immediate index that's within the bottom 128-bits.
-  let Predicates = [HasNEONandIsStreamingSafe], AddedComplexity = 1 in {
+  let Predicates = [HasNEON], AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)),
             (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)),
@@ -3360,9 +3360,9 @@ let Predicates = [HasSVEorSME] in {
             (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)),
             (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
-  } // End HasNEONandIsStreamingSafe
+  } // End HasNEON
 
-  let Predicates = [HasNEONandIsStreamingSafe] in {
+  let Predicates = [HasNEON] in {
   def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8),
             (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index))), i8),
@@ -3375,7 +3375,7 @@ let Predicates = [HasSVEorSME] in {
 
   def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
             (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
-  } // End HasNEONandIsStreamingSafe
+  } // End HasNEON
 
   // Extract first element from vector.
   let AddedComplexity = 2 in {

>From 1c5c3b85a6b4970d61871e173fd2bc0b83544440 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 17 Jun 2024 17:09:47 +0100
Subject: [PATCH 5/7] Fix one more predicate

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a69f6663babe1..4b031fd6d546d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8376,7 +8376,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
 
 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
-let Predicates = [HasNEONandIsStreamingSafe] in {
+let Predicates = [HasNEON] in {
   class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
                           SDPatternOperator ExtLoad, Instruction LD1>
     : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),

>From 48c0acd5f4ed4d9fc7790cd5129e4ec6ac0fb0cb Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Wed, 19 Jun 2024 09:33:58 +0100
Subject: [PATCH 6/7] Fix predicates for UCVTF/SCVTF and update comment.

---
 llvm/lib/Target/AArch64/AArch64InstrFormats.td | 4 +++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td    | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 608698954df42..1948c5a268114 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7616,11 +7616,13 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
                               SDPatternOperator OpNode> {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
   def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
                                 [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
   def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
                                 [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
-  let Predicates = [HasNEON, HasFullFP16] in {
+  }
+  let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
   def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
                                 [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4b031fd6d546d..19b359f3bf0e2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6023,7 +6023,7 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
 def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
           (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
 
-let Predicates = [HasFullFP16] in {
+let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
 def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
           (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
 def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
@@ -6115,7 +6115,7 @@ def : Pat <(f64 (uint_to_fp (i32
                           (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
 // 64-bits -> double are handled in target specific dag combine:
 // performIntToFpCombine.
-} // let Predicates = [HasNEON]
+} // let Predicates = [HasNEONandIsStreamingSafe]
 
 //===----------------------------------------------------------------------===//
 // Advanced SIMD three different-sized vector instructions.

>From 1e52110719798fed51376a56d084a50e1afab957 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Wed, 19 Jun 2024 13:32:36 +0100
Subject: [PATCH 7/7] Move HasNEONandIsStreamingSafe predicate to multiclass,
 since all instructions are affected

---
 llvm/lib/Target/AArch64/AArch64InstrFormats.td | 7 +++----
 llvm/lib/Target/AArch64/AArch64InstrInfo.td    | 6 +++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1948c5a268114..e1ecc5a57dd26 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7602,13 +7602,12 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
 }
 
 let mayRaiseFPException = 1, Uses = [FPCR] in
-multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
-                           Predicate pred = HasNEON> {
-  let Predicates = [pred] in {
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
+  let Predicates = [HasNEONandIsStreamingSafe] in {
   def v1i64       : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
   def v1i32       : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
   }
-  let Predicates = [pred, HasFullFP16] in {
+  let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
   def v1f16       : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 19b359f3bf0e2..6afee9bd388a6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5891,9 +5891,9 @@ defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu">;
 def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
 defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">;
 defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe", HasNEONandIsStreamingSafe>;
-defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx", HasNEONandIsStreamingSafe>;
-defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte", HasNEONandIsStreamingSafe>;
+defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte">;
 defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
                                  UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
 defm SCVTF  : SIMDFPTwoScalarCVT<   0, 0, 0b11101, "scvtf", AArch64sitof>;



More information about the llvm-commits mailing list