[llvm] 86972f1 - [AArch64][SVE] Use TargetFrameIndex in more SVE load/store addressing modes

Bradley Smith via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 29 08:00:23 PDT 2021


Author: Bradley Smith
Date: 2021-10-29T14:44:16Z
New Revision: 86972f111497bd15df8da181d0d7ac68b866320b

URL: https://github.com/llvm/llvm-project/commit/86972f111497bd15df8da181d0d7ac68b866320b
DIFF: https://github.com/llvm/llvm-project/commit/86972f111497bd15df8da181d0d7ac68b866320b.diff

LOG: [AArch64][SVE] Use TargetFrameIndex in more SVE load/store addressing modes

Add support for generating TargetFrameIndex in the complex patterns used
for indexed SVE load/store addressing modes, so that frame-index bases
can be folded directly. Additionally, add the missing SVE load/store
opcodes to getMemOpInfo and getLoadStoreImmIdx.

Differential Revision: https://reviews.llvm.org/D112617
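
In short: the complex pattern behind these addressing modes now recognises
frame indices itself, which is what allows the dedicated _fi TableGen
patterns to be removed and the extra "mov xN, sp" instructions to disappear
from the tests. Below is a condensed, illustrative sketch of the two new
cases in SelectAddrModeIndexedSVE; the unchanged offset-legality logic in
the middle of the function is only summarised by a comment, the
"/* elided */" initialiser is a placeholder rather than real code, and the
diff further down remains the authoritative version.

    bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
                                                       SDValue &Base,
                                                       SDValue &OffImm) {
      const DataLayout &DL = CurDAG->getDataLayout();

      // New: a bare frame index is selected as "TargetFrameIndex + #0".
      if (N.getOpcode() == ISD::FrameIndex) {
        int FI = cast<FrameIndexSDNode>(N)->getIndex();
        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
        OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
        return true;
      }

      // Existing logic (not reproduced here): derive the memory VT from Root,
      // require N to be an add of a base and a constant, and check that the
      // constant fits the addressing mode's signed, VL-scaled immediate
      // range, yielding the element-scaled value Offset.
      int64_t Offset = /* elided */ 0;

      // New: if the base of that add is itself a frame index, rewrite it to
      // a TargetFrameIndex so the reg+imm pattern still folds the stack slot.
      Base = N.getOperand(0);
      if (Base.getOpcode() == ISD::FrameIndex) {
        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
      }
      OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
      return true;
    }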

Added: 
    llvm/test/CodeGen/AArch64/sve-ldnf1.mir
    llvm/test/CodeGen/AArch64/sve-ldstnt1.mir

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
    llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
    llvm/test/CodeGen/AArch64/sve-insert-element.ll
    llvm/test/CodeGen/AArch64/sve-insert-vector.ll
    llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
    llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 7fe1ecaa68bbe..fe9b2f8883b9d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5050,6 +5050,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
                                                    SDValue &Base,
                                                    SDValue &OffImm) {
   const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
+  const DataLayout &DL = CurDAG->getDataLayout();
+
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+    OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
+    return true;
+  }
 
   if (MemVT == EVT())
     return false;
@@ -5073,6 +5081,11 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
     return false;
 
   Base = N.getOperand(0);
+  if (Base.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+  }
+
   OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
   return true;
 }

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 6f5e32c5ced58..c79c19b2fbebd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2263,32 +2263,35 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::STNPSi:
   case AArch64::LDG:
   case AArch64::STGPi:
+
   case AArch64::LD1B_IMM:
-  case AArch64::LD1H_IMM:
-  case AArch64::LD1W_IMM:
-  case AArch64::LD1D_IMM:
-  case AArch64::ST1B_IMM:
-  case AArch64::ST1H_IMM:
-  case AArch64::ST1W_IMM:
-  case AArch64::ST1D_IMM:
   case AArch64::LD1B_H_IMM:
+  case AArch64::LD1B_S_IMM:
+  case AArch64::LD1B_D_IMM:
   case AArch64::LD1SB_H_IMM:
+  case AArch64::LD1SB_S_IMM:
+  case AArch64::LD1SB_D_IMM:
+  case AArch64::LD1H_IMM:
   case AArch64::LD1H_S_IMM:
+  case AArch64::LD1H_D_IMM:
   case AArch64::LD1SH_S_IMM:
+  case AArch64::LD1SH_D_IMM:
+  case AArch64::LD1W_IMM:
   case AArch64::LD1W_D_IMM:
   case AArch64::LD1SW_D_IMM:
+  case AArch64::LD1D_IMM:
+
+  case AArch64::ST1B_IMM:
   case AArch64::ST1B_H_IMM:
-  case AArch64::ST1H_S_IMM:
-  case AArch64::ST1W_D_IMM:
-  case AArch64::LD1B_S_IMM:
-  case AArch64::LD1SB_S_IMM:
-  case AArch64::LD1H_D_IMM:
-  case AArch64::LD1SH_D_IMM:
   case AArch64::ST1B_S_IMM:
-  case AArch64::ST1H_D_IMM:
-  case AArch64::LD1B_D_IMM:
-  case AArch64::LD1SB_D_IMM:
   case AArch64::ST1B_D_IMM:
+  case AArch64::ST1H_IMM:
+  case AArch64::ST1H_S_IMM:
+  case AArch64::ST1H_D_IMM:
+  case AArch64::ST1W_IMM:
+  case AArch64::ST1W_D_IMM:
+  case AArch64::ST1D_IMM:
+
   case AArch64::LD1RB_IMM:
   case AArch64::LD1RB_H_IMM:
   case AArch64::LD1RB_S_IMM:
@@ -2305,6 +2308,32 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::LD1RW_D_IMM:
   case AArch64::LD1RSW_IMM:
   case AArch64::LD1RD_IMM:
+
+  case AArch64::LDNT1B_ZRI:
+  case AArch64::LDNT1H_ZRI:
+  case AArch64::LDNT1W_ZRI:
+  case AArch64::LDNT1D_ZRI:
+  case AArch64::STNT1B_ZRI:
+  case AArch64::STNT1H_ZRI:
+  case AArch64::STNT1W_ZRI:
+  case AArch64::STNT1D_ZRI:
+
+  case AArch64::LDNF1B_IMM:
+  case AArch64::LDNF1B_H_IMM:
+  case AArch64::LDNF1B_S_IMM:
+  case AArch64::LDNF1B_D_IMM:
+  case AArch64::LDNF1SB_H_IMM:
+  case AArch64::LDNF1SB_S_IMM:
+  case AArch64::LDNF1SB_D_IMM:
+  case AArch64::LDNF1H_IMM:
+  case AArch64::LDNF1H_S_IMM:
+  case AArch64::LDNF1H_D_IMM:
+  case AArch64::LDNF1SH_S_IMM:
+  case AArch64::LDNF1SH_D_IMM:
+  case AArch64::LDNF1W_IMM:
+  case AArch64::LDNF1W_D_IMM:
+  case AArch64::LDNF1SW_D_IMM:
+  case AArch64::LDNF1D_IMM:
     return 3;
   case AArch64::ADDG:
   case AArch64::STGOffset:
@@ -2855,10 +2884,22 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
   case AArch64::LD1H_IMM:
   case AArch64::LD1W_IMM:
   case AArch64::LD1D_IMM:
+  case AArch64::LDNT1B_ZRI:
+  case AArch64::LDNT1H_ZRI:
+  case AArch64::LDNT1W_ZRI:
+  case AArch64::LDNT1D_ZRI:
   case AArch64::ST1B_IMM:
   case AArch64::ST1H_IMM:
   case AArch64::ST1W_IMM:
   case AArch64::ST1D_IMM:
+  case AArch64::STNT1B_ZRI:
+  case AArch64::STNT1H_ZRI:
+  case AArch64::STNT1W_ZRI:
+  case AArch64::STNT1D_ZRI:
+  case AArch64::LDNF1B_IMM:
+  case AArch64::LDNF1H_IMM:
+  case AArch64::LDNF1W_IMM:
+  case AArch64::LDNF1D_IMM:
     // A full vectors worth of data
     // Width = mbytes * elements
     Scale = TypeSize::Scalable(16);
@@ -2875,6 +2916,12 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
   case AArch64::ST1B_H_IMM:
   case AArch64::ST1H_S_IMM:
   case AArch64::ST1W_D_IMM:
+  case AArch64::LDNF1B_H_IMM:
+  case AArch64::LDNF1SB_H_IMM:
+  case AArch64::LDNF1H_S_IMM:
+  case AArch64::LDNF1SH_S_IMM:
+  case AArch64::LDNF1W_D_IMM:
+  case AArch64::LDNF1SW_D_IMM:
     // A half vector worth of data
     // Width = mbytes * elements
     Scale = TypeSize::Scalable(8);
@@ -2888,6 +2935,10 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
   case AArch64::LD1SH_D_IMM:
   case AArch64::ST1B_S_IMM:
   case AArch64::ST1H_D_IMM:
+  case AArch64::LDNF1B_S_IMM:
+  case AArch64::LDNF1SB_S_IMM:
+  case AArch64::LDNF1H_D_IMM:
+  case AArch64::LDNF1SH_D_IMM:
     // A quarter vector worth of data
     // Width = mbytes * elements
     Scale = TypeSize::Scalable(4);
@@ -2898,6 +2949,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
   case AArch64::LD1B_D_IMM:
   case AArch64::LD1SB_D_IMM:
   case AArch64::ST1B_D_IMM:
+  case AArch64::LDNF1B_D_IMM:
+  case AArch64::LDNF1SB_D_IMM:
     // A eighth vector worth of data
     // Width = mbytes * elements
     Scale = TypeSize::Scalable(2);

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 0ac0eb2999d29..67d8fbb45cf53 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2200,10 +2200,6 @@ let Predicates = [HasSVEorStreamingSVE] in {
       def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
                      (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
-    let AddedComplexity = 3 in {
-      def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
-                    (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
-    }
 
     def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
               (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
@@ -2240,10 +2236,6 @@ let Predicates = [HasSVEorStreamingSVE] in {
       def _imm: Pat<(Ty (Load  (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
                     (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
-    let AddedComplexity = 3 in {
-      def _fi : Pat<(Ty (Load  (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
-                    (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
-    }
 
     def : Pat<(Ty (Load GPR64:$base)),
               (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;

diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index 0077ea3b7ff27..91b2281b167e6 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -40,14 +40,14 @@ define <vscale x 16 x i8> @splice_nxv16i8_clamped_idx(<vscale x 16 x i8> %a, <vs
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mov w10, #256
+; CHECK-NEXT:    mov w9, #256
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1b { z1.b }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT:    cmp x8, #256
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -81,14 +81,14 @@ define <vscale x 8 x i16> @splice_nxv8i16_clamped_idx(<vscale x 8 x i16> %a, <vs
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #128
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov w10, #128
-; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x9, x8, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -122,14 +122,14 @@ define <vscale x 4 x i32> @splice_nxv4i32_clamped_idx(<vscale x 4 x i32> %a, <vs
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #64
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov w10, #64
-; CHECK-NEXT:    cmp x8, #64
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    cmp x8, #64
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -163,14 +163,14 @@ define <vscale x 2 x i64> @splice_nxv2i64_clamped_idx(<vscale x 2 x i64> %a, <vs
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #32
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov w10, #32
-; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9, x8, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -200,7 +200,7 @@ define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vsc
 ; CHECK-NEXT:    mov x9, #-8
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -237,15 +237,15 @@ define <vscale x 2 x half> @splice_nxv2f16_clamped_idx(<vscale x 2 x half> %a, <
 ; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    mov w9, #32
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #3
-; CHECK-NEXT:    st1h { z1.h }, p0, [x10, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x10, x8]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -274,7 +274,7 @@ define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vsc
 ; CHECK-NEXT:    mov x9, #-6
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -311,15 +311,15 @@ define <vscale x 4 x half> @splice_nxv4f16_clamped_idx(<vscale x 4 x half> %a, <
 ; CHECK-NEXT:    cntw x8
 ; CHECK-NEXT:    mov w9, #64
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    cmp x8, #64
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    cmp x8, #64
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #2
-; CHECK-NEXT:    st1h { z1.h }, p0, [x10, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x10, x8]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -352,14 +352,14 @@ define <vscale x 8 x half> @splice_nxv8f16_clamped_idx(<vscale x 8 x half> %a, <
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #128
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov w10, #128
-; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x9, x8, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -389,7 +389,7 @@ define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <v
 ; CHECK-NEXT:    mov x9, #-4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -426,15 +426,15 @@ define <vscale x 2 x float> @splice_nxv2f32_clamped_idx(<vscale x 2 x float> %a,
 ; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    mov w9, #32
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #3
-; CHECK-NEXT:    st1w { z1.s }, p0, [x10, #1, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x10, x8]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -467,14 +467,14 @@ define <vscale x 4 x float> @splice_nxv4f32_clamped_idx(<vscale x 4 x float> %a,
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #64
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov w10, #64
-; CHECK-NEXT:    cmp x8, #64
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    cmp x8, #64
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -508,14 +508,14 @@ define <vscale x 2 x double> @splice_nxv2f64_clamped_idx(<vscale x 2 x double> %
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #32
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov w10, #32
-; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    cmp x8, #32
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9, x8, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -602,13 +602,13 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    orr x9, x8, #0x8
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    orr x8, x8, #0x8
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
-; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -623,21 +623,21 @@ define <vscale x 16 x float> @splice_nxv16f32_clamped_idx(<vscale x 16 x float>
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-8
 ; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w10, #16
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #16
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    cmp x8, #16
-; CHECK-NEXT:    st1w { z3.s }, p0, [x9, #3, mul vl]
-; CHECK-NEXT:    csel x8, x8, x10, lo
-; CHECK-NEXT:    st1w { z2.s }, p0, [x9, #2, mul vl]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z7.s }, p0, [sp, #7, mul vl]
+; CHECK-NEXT:    st1w { z4.s }, p0, [sp, #4, mul vl]
 ; CHECK-NEXT:    add x10, x9, x8, lsl #2
-; CHECK-NEXT:    st1w { z7.s }, p0, [x9, #7, mul vl]
-; CHECK-NEXT:    st1w { z4.s }, p0, [x9, #4, mul vl]
-; CHECK-NEXT:    st1w { z5.s }, p0, [x9, #5, mul vl]
-; CHECK-NEXT:    st1w { z6.s }, p0, [x9, #6, mul vl]
+; CHECK-NEXT:    st1w { z5.s }, p0, [sp, #5, mul vl]
+; CHECK-NEXT:    st1w { z6.s }, p0, [sp, #6, mul vl]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, #1, mul vl]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x10, #2, mul vl]
@@ -662,7 +662,7 @@ define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
 ; CHECK-NEXT:    mov x9, #-16
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -696,11 +696,11 @@ define <vscale x 16 x i8> @splice_nxv16i8_clamped(<vscale x 16 x i8> %a, <vscale
 ; CHECK-NEXT:    mov w10, #17
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    addvl x10, x8, #1
+; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x9]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -717,7 +717,7 @@ define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i
 ; CHECK-NEXT:    mov x9, #-8
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -751,11 +751,11 @@ define <vscale x 8 x i16> @splice_nxv8i16_clamped(<vscale x 8 x i16> %a, <vscale
 ; CHECK-NEXT:    mov w10, #18
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    addvl x10, x8, #1
+; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x9]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -772,7 +772,7 @@ define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i
 ; CHECK-NEXT:    mov x9, #-4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -806,11 +806,11 @@ define <vscale x 4 x i32> @splice_nxv4i32_clamped(<vscale x 4 x i32> %a, <vscale
 ; CHECK-NEXT:    mov w10, #20
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    addvl x10, x8, #1
+; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -827,7 +827,7 @@ define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i
 ; CHECK-NEXT:    mov x9, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -861,11 +861,11 @@ define <vscale x 2 x i64> @splice_nxv2i64_clamped(<vscale x 2 x i64> %a, <vscale
 ; CHECK-NEXT:    mov w10, #24
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    addvl x10, x8, #1
+; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -882,7 +882,7 @@ define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x
 ; CHECK-NEXT:    mov x9, #-8
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -916,11 +916,11 @@ define <vscale x 8 x half> @splice_nxv8f16_clamped(<vscale x 8 x half> %a, <vsca
 ; CHECK-NEXT:    mov w10, #18
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    addvl x10, x8, #1
+; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x9]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -937,7 +937,7 @@ define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4
 ; CHECK-NEXT:    mov x9, #-4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -971,11 +971,11 @@ define <vscale x 4 x float> @splice_nxv4f32_clamped(<vscale x 4 x float> %a, <vs
 ; CHECK-NEXT:    mov w10, #20
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    addvl x10, x8, #1
+; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -992,7 +992,7 @@ define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x
 ; CHECK-NEXT:    mov x9, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -1026,11 +1026,11 @@ define <vscale x 2 x double> @splice_nxv2f64_clamped(<vscale x 2 x double> %a, <
 ; CHECK-NEXT:    mov w10, #24
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    addvl x10, x8, #1
+; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1112,7 +1112,7 @@ define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
 ; CHECK-NEXT:    mov x9, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -1129,16 +1129,16 @@ define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x10, #-8
+; CHECK-NEXT:    mov x9, #-8
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    addvl x9, x8, #2
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    sub x11, x9, #32
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
-; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x10, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x11, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    sub x10, x8, #32
+; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1152,26 +1152,26 @@ define <vscale x 16 x float> @splice_nxv16f32_clamped(<vscale x 16 x float> %a,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-8
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #4
-; CHECK-NEXT:    cmp x9, #68
-; CHECK-NEXT:    mov w10, #68
-; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    rdvl x8, #4
+; CHECK-NEXT:    cmp x8, #68
+; CHECK-NEXT:    mov w9, #68
+; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    addvl x10, x8, #4
-; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x9, x10, #4
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z7.s }, p0, [x8, #7, mul vl]
-; CHECK-NEXT:    st1w { z4.s }, p0, [x8, #4, mul vl]
-; CHECK-NEXT:    st1w { z5.s }, p0, [x8, #5, mul vl]
-; CHECK-NEXT:    st1w { z6.s }, p0, [x8, #6, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x9, #1, mul vl]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x9, #2, mul vl]
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x9, #3, mul vl]
+; CHECK-NEXT:    st1w { z7.s }, p0, [sp, #7, mul vl]
+; CHECK-NEXT:    st1w { z4.s }, p0, [sp, #4, mul vl]
+; CHECK-NEXT:    st1w { z5.s }, p0, [sp, #5, mul vl]
+; CHECK-NEXT:    st1w { z6.s }, p0, [sp, #6, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x8, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #8
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index b27b7a1e32500..40af9abcc555b 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -12,17 +12,16 @@ define float @foo1(double* %x0, double* %x1, double* %x2) nounwind {
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x2]
-; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z17.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    bl callee1
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
@@ -44,24 +43,27 @@ define float @foo2(double* %x0, double* %x1) nounwind {
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    add x9, sp, #16
 ; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    mov w1, #1
 ; CHECK-NEXT:    mov w2, #2
+; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
+; CHECK-NEXT:    add x9, sp, #16
 ; CHECK-NEXT:    mov w3, #3
 ; CHECK-NEXT:    mov w4, #4
 ; CHECK-NEXT:    mov w5, #5
 ; CHECK-NEXT:    mov w6, #6
+; CHECK-NEXT:    st1d { z17.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    add x9, sp, #16
 ; CHECK-NEXT:    mov w7, #7
+; CHECK-NEXT:    st1d { z18.d }, p0, [x9, #2, mul vl]
 ; CHECK-NEXT:    add x9, sp, #16
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
-; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x9, #3, mul vl]
 ; CHECK-NEXT:    str x8, [sp]
 ; CHECK-NEXT:    bl callee2
 ; CHECK-NEXT:    addvl sp, sp, #4
@@ -83,17 +85,16 @@ define float @foo3(double* %x0, double* %x1, double* %x2) nounwind {
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld3d { z16.d, z17.d, z18.d }, p0/z, [x1]
 ; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2]
-; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    fmov s1, #2.00000000
 ; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z17.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    bl callee3
 ; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index 4532ba311a8be..8f786863094a3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -55,43 +55,42 @@ define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i1>* %c) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldrh w9, [x2]
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ldrh w8, [x2]
 ; CHECK-NEXT:    ptrue p0.h, vl16
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    sbfx w10, w9, #15, #1
-; CHECK-NEXT:    sbfx w11, w9, #14, #1
-; CHECK-NEXT:    sbfx w12, w9, #13, #1
-; CHECK-NEXT:    strh w10, [sp, #30]
-; CHECK-NEXT:    sbfx w10, w9, #12, #1
-; CHECK-NEXT:    strh w11, [sp, #28]
-; CHECK-NEXT:    sbfx w11, w9, #11, #1
-; CHECK-NEXT:    strh w12, [sp, #26]
-; CHECK-NEXT:    sbfx w12, w9, #10, #1
-; CHECK-NEXT:    strh w10, [sp, #24]
-; CHECK-NEXT:    sbfx w10, w9, #9, #1
-; CHECK-NEXT:    strh w11, [sp, #22]
-; CHECK-NEXT:    sbfx w11, w9, #8, #1
-; CHECK-NEXT:    strh w12, [sp, #20]
-; CHECK-NEXT:    sbfx w12, w9, #7, #1
-; CHECK-NEXT:    strh w10, [sp, #18]
-; CHECK-NEXT:    sbfx w10, w9, #6, #1
-; CHECK-NEXT:    strh w11, [sp, #16]
-; CHECK-NEXT:    sbfx w11, w9, #5, #1
-; CHECK-NEXT:    strh w12, [sp, #14]
-; CHECK-NEXT:    sbfx w12, w9, #4, #1
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    sbfx w10, w9, #3, #1
-; CHECK-NEXT:    strh w11, [sp, #10]
-; CHECK-NEXT:    sbfx w11, w9, #2, #1
-; CHECK-NEXT:    strh w12, [sp, #8]
-; CHECK-NEXT:    sbfx w12, w9, #1, #1
-; CHECK-NEXT:    sbfx w9, w9, #0, #1
-; CHECK-NEXT:    strh w10, [sp, #6]
-; CHECK-NEXT:    strh w11, [sp, #4]
-; CHECK-NEXT:    strh w12, [sp, #2]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    sbfx w9, w8, #15, #1
+; CHECK-NEXT:    sbfx w10, w8, #14, #1
+; CHECK-NEXT:    sbfx w11, w8, #13, #1
+; CHECK-NEXT:    strh w9, [sp, #30]
+; CHECK-NEXT:    sbfx w9, w8, #12, #1
+; CHECK-NEXT:    strh w10, [sp, #28]
+; CHECK-NEXT:    sbfx w10, w8, #11, #1
+; CHECK-NEXT:    strh w11, [sp, #26]
+; CHECK-NEXT:    sbfx w11, w8, #10, #1
+; CHECK-NEXT:    strh w9, [sp, #24]
+; CHECK-NEXT:    sbfx w9, w8, #9, #1
+; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    sbfx w10, w8, #8, #1
+; CHECK-NEXT:    strh w11, [sp, #20]
+; CHECK-NEXT:    sbfx w11, w8, #7, #1
+; CHECK-NEXT:    strh w9, [sp, #18]
+; CHECK-NEXT:    sbfx w9, w8, #6, #1
+; CHECK-NEXT:    strh w10, [sp, #16]
+; CHECK-NEXT:    sbfx w10, w8, #5, #1
+; CHECK-NEXT:    strh w11, [sp, #14]
+; CHECK-NEXT:    sbfx w11, w8, #4, #1
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    sbfx w9, w8, #3, #1
+; CHECK-NEXT:    strh w10, [sp, #10]
+; CHECK-NEXT:    sbfx w10, w8, #2, #1
+; CHECK-NEXT:    strh w11, [sp, #8]
+; CHECK-NEXT:    sbfx w11, w8, #1, #1
+; CHECK-NEXT:    sbfx w8, w8, #0, #1
+; CHECK-NEXT:    strh w9, [sp, #6]
+; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    strh w11, [sp, #2]
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.h, z0.h, #0x1
@@ -119,75 +118,74 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i1>* %c) #0 {
 ; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    ldr w9, [x2]
-; VBITS_GE_512-NEXT:    mov x8, sp
+; VBITS_GE_512-NEXT:    ldr w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
 ; VBITS_GE_512-NEXT:    ptrue p1.h
-; VBITS_GE_512-NEXT:    asr w10, w9, #31
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #30, #1
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #29, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #62]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #28, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #60]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #27, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #58]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #26, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #25, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #54]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #24, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #52]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #23, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #50]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #22, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #21, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #46]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #20, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #44]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #19, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #42]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #18, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #17, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #38]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #16, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #36]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #15, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #34]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #14, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #13, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #30]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #12, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #28]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #11, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #26]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #10, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #9, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #22]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #8, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #20]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #18]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #14]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #4, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #12]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #3, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #10]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #2, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #8]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #6]
-; VBITS_GE_512-NEXT:    strh w12, [sp, #4]
-; VBITS_GE_512-NEXT:    strh w10, [sp, #2]
-; VBITS_GE_512-NEXT:    strh w9, [sp]
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; VBITS_GE_512-NEXT:    asr w9, w8, #31
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #30, #1
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #29, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #62]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #28, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #60]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #27, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #58]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #26, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #56]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #25, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #54]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #24, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #52]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #23, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #50]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #22, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #48]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #21, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #46]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #20, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #44]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #19, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #42]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #18, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #40]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #17, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #38]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #16, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #36]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #15, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #34]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #14, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #32]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #13, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #30]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #12, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #28]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #11, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #26]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #10, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #24]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #9, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #22]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #8, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #20]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #18]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #16]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #14]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #4, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #12]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #3, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #10]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #2, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #8]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #1, #1
+; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #6]
+; VBITS_GE_512-NEXT:    strh w11, [sp, #4]
+; VBITS_GE_512-NEXT:    strh w9, [sp, #2]
+; VBITS_GE_512-NEXT:    strh w8, [sp]
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    and z0.h, z0.h, #0x1
@@ -215,139 +213,138 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i1>* %c) #0 {
 ; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    ldr x9, [x2]
-; VBITS_GE_1024-NEXT:    mov x8, sp
+; VBITS_GE_1024-NEXT:    ldr x8, [x2]
 ; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
 ; VBITS_GE_1024-NEXT:    ptrue p1.h
-; VBITS_GE_1024-NEXT:    asr x10, x9, #63
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #126]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #124]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #122]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #118]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #116]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #114]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #110]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #108]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #106]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #102]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #100]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #98]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #94]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #92]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #90]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #86]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #84]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #82]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #78]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #76]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #74]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #70]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #68]
-; VBITS_GE_1024-NEXT:    asr w12, w9, #31
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #66]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #64]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #62]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #60]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #58]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #54]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #52]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #50]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #46]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #44]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #42]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #38]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #36]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #34]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #30]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #28]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #26]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #22]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #20]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #18]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #14]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #12]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #10]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #8]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #6]
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #4]
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #2]
-; VBITS_GE_1024-NEXT:    strh w9, [sp]
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; VBITS_GE_1024-NEXT:    asr x9, x8, #63
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #126]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #124]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #122]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #120]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #118]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #116]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #114]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #112]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #110]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #108]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #106]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #104]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #102]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #100]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #98]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #96]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #94]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #92]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #90]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #88]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #86]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #84]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #82]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #80]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #78]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #76]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #74]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #72]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #70]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #68]
+; VBITS_GE_1024-NEXT:    asr w11, w8, #31
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #66]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #64]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #62]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #60]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #58]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #56]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #54]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #52]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #50]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #48]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #46]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #44]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #42]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #40]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #38]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #36]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #34]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #32]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #30]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #28]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #26]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #24]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #22]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #20]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #18]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #16]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #14]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #12]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #10]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #8]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #6]
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #4]
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #2]
+; VBITS_GE_1024-NEXT:    strh w8, [sp]
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT:    and z0.h, z0.h, #0x1
@@ -375,268 +372,267 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i1>* %c)
 ; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    ldr x9, [x2, #8]
-; VBITS_GE_2048-NEXT:    mov x8, sp
+; VBITS_GE_2048-NEXT:    ldr x8, [x2, #8]
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
 ; VBITS_GE_2048-NEXT:    ptrue p1.h
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #254]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #252]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #250]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #246]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #244]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #242]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #238]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #236]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #234]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #230]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #228]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #226]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #222]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #220]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #218]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #214]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #212]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #210]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #206]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #204]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #202]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #198]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #196]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #194]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #190]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #188]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #186]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #182]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #180]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #178]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #174]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #172]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #170]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #166]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #164]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #162]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #158]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #156]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #154]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #150]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #148]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #146]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #142]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #140]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #138]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #136]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #134]
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #132]
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #130]
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #128]
-; VBITS_GE_2048-NEXT:    ldr x9, [x2]
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #126]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #124]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #122]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #118]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #116]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #114]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #110]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #108]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #106]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #102]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #100]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #98]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #94]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #92]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #90]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #86]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #84]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #82]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #78]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #76]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #74]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #70]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #68]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #66]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #62]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #60]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #58]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #54]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #52]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #50]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #46]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #44]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #42]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #38]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #36]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #34]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #30]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #28]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #26]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #22]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #20]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #18]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #14]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #12]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #10]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #8]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #6]
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #4]
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #2]
-; VBITS_GE_2048-NEXT:    strh w9, [sp]
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #254]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #252]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #250]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #248]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #246]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #244]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #242]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #240]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #238]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #236]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #234]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #232]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #230]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #228]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #226]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #224]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #222]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #220]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #218]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #216]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #214]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #212]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #210]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #208]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #206]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #204]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #202]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #200]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #198]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #196]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #194]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #192]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #190]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #188]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #186]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #184]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #182]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #180]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #178]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #176]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #174]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #172]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #170]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #168]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #166]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #164]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #162]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #160]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #158]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #156]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #154]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #152]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #150]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #148]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #146]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #144]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #142]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #140]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #138]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #136]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #134]
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #132]
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #130]
+; VBITS_GE_2048-NEXT:    strh w8, [sp, #128]
+; VBITS_GE_2048-NEXT:    ldr x8, [x2]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #126]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #124]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #122]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #120]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #118]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #116]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #114]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #112]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #110]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #108]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #106]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #104]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #102]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #100]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #98]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #96]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #94]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #92]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #90]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #88]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #86]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #84]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #82]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #80]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #78]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #76]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #74]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #72]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #70]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #68]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #66]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #64]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #62]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #60]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #58]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #56]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #54]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #52]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #50]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #48]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #46]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #44]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #42]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #40]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #38]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #36]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #34]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #32]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #30]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #28]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #26]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #24]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #22]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #20]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #18]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #16]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #14]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #12]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #10]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #8]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #6]
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #4]
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #2]
+; VBITS_GE_2048-NEXT:    strh w8, [sp]
+; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    and z0.h, z0.h, #0x1
@@ -689,23 +685,22 @@ define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i1>* %c) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldrb w9, [x2]
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ldrb w8, [x2]
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    sbfx w10, w9, #7, #1
-; CHECK-NEXT:    sbfx w11, w9, #6, #1
-; CHECK-NEXT:    sbfx w12, w9, #5, #1
-; CHECK-NEXT:    sbfx w13, w9, #4, #1
-; CHECK-NEXT:    stp w11, w10, [sp, #24]
-; CHECK-NEXT:    sbfx w10, w9, #3, #1
-; CHECK-NEXT:    sbfx w11, w9, #2, #1
-; CHECK-NEXT:    stp w13, w12, [sp, #16]
-; CHECK-NEXT:    sbfx w12, w9, #1, #1
-; CHECK-NEXT:    sbfx w9, w9, #0, #1
-; CHECK-NEXT:    stp w11, w10, [sp, #8]
-; CHECK-NEXT:    stp w9, w12, [sp]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    sbfx w9, w8, #7, #1
+; CHECK-NEXT:    sbfx w10, w8, #6, #1
+; CHECK-NEXT:    sbfx w11, w8, #5, #1
+; CHECK-NEXT:    sbfx w12, w8, #4, #1
+; CHECK-NEXT:    stp w10, w9, [sp, #24]
+; CHECK-NEXT:    sbfx w9, w8, #3, #1
+; CHECK-NEXT:    sbfx w10, w8, #2, #1
+; CHECK-NEXT:    stp w12, w11, [sp, #16]
+; CHECK-NEXT:    sbfx w11, w8, #1, #1
+; CHECK-NEXT:    sbfx w8, w8, #0, #1
+; CHECK-NEXT:    stp w10, w9, [sp, #8]
+; CHECK-NEXT:    stp w8, w11, [sp]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
@@ -733,35 +728,34 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i1>* %c) #0
 ; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    ldrh w9, [x2]
-; VBITS_GE_512-NEXT:    mov x8, sp
+; VBITS_GE_512-NEXT:    ldrh w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT:    ptrue p1.s
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_512-NEXT:    sbfx w13, w9, #12, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #11, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #10, #1
-; VBITS_GE_512-NEXT:    stp w13, w12, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #9, #1
-; VBITS_GE_512-NEXT:    sbfx w13, w9, #8, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_512-NEXT:    stp w13, w12, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_512-NEXT:    sbfx w13, w9, #4, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_512-NEXT:    stp w13, w12, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #8]
-; VBITS_GE_512-NEXT:    stp w9, w12, [sp]
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_512-NEXT:    sbfx w12, w8, #12, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #56]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #11, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #10, #1
+; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #48]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #9, #1
+; VBITS_GE_512-NEXT:    sbfx w12, w8, #8, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #40]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #32]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_512-NEXT:    sbfx w12, w8, #4, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #24]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #16]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #8]
+; VBITS_GE_512-NEXT:    stp w8, w11, [sp]
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    and z0.s, z0.s, #0x1
@@ -789,59 +783,58 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i1>* %c) #0
 ; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    ldr w9, [x2]
-; VBITS_GE_1024-NEXT:    mov x8, sp
+; VBITS_GE_1024-NEXT:    ldr w8, [x2]
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
 ; VBITS_GE_1024-NEXT:    ptrue p1.s
-; VBITS_GE_1024-NEXT:    asr w10, w9, #31
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #30, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #29, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #28, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #24, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #23, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #22, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #21, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #20, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #19, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #18, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #17, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #16, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #64]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #12, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #11, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #10, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #9, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #8, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #4, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #8]
-; VBITS_GE_1024-NEXT:    stp w9, w12, [sp]
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; VBITS_GE_1024-NEXT:    asr w9, w8, #31
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #30, #1
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #29, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #28, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #120]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #112]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #24, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #104]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #23, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #22, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #96]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #21, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #20, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #88]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #19, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #18, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #80]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #17, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #16, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #72]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #64]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #12, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #56]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #11, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #10, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #48]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #9, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #8, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #40]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #32]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #4, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #24]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #16]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #8]
+; VBITS_GE_1024-NEXT:    stp w8, w11, [sp]
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT:    and z0.s, z0.s, #0x1
@@ -869,107 +862,106 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i1>* %c) #0
 ; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    ldr x9, [x2]
-; VBITS_GE_2048-NEXT:    mov x8, sp
+; VBITS_GE_2048-NEXT:    ldr x8, [x2]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
 ; VBITS_GE_2048-NEXT:    ptrue p1.s
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #60, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #59, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #58, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #57, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #56, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #55, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #54, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #53, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #52, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #48, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #47, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #46, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #45, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #44, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #43, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #42, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #41, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #40, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #36, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #35, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #34, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #33, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #32, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #136]
-; VBITS_GE_2048-NEXT:    asr w10, w9, #31
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #30, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #128]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #29, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #28, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #24, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #23, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #22, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #21, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #20, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #19, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #18, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #17, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #16, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #12, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #11, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #10, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #9, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #8, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #4, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #8]
-; VBITS_GE_2048-NEXT:    stp w9, w12, [sp]
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #60, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #248]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #59, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #58, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #240]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #57, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #56, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #232]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #55, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #54, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #224]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #53, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #52, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #216]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #208]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #48, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #200]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #47, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #46, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #192]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #45, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #44, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #184]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #43, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #42, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #176]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #41, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #40, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #168]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #160]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #36, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #152]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #35, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #34, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #144]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #33, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #32, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #136]
+; VBITS_GE_2048-NEXT:    asr w9, w8, #31
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #30, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #128]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #29, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #28, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #120]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #112]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #24, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #104]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #23, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #22, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #96]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #21, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #20, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #88]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #19, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #18, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #80]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #17, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #16, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #72]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #64]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #12, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #56]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #11, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #10, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #48]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #9, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #8, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #40]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #32]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #4, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #24]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #16]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #8]
+; VBITS_GE_2048-NEXT:    stp w8, w11, [sp]
+; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    and z0.s, z0.s, #0x1
@@ -1023,20 +1015,19 @@ define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i1>* %c) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldrb w9, [x2]
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ldrb w8, [x2]
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    lsr w10, w9, #3
-; CHECK-NEXT:    lsr w11, w9, #2
-; CHECK-NEXT:    sbfx x12, x9, #0, #1
-; CHECK-NEXT:    lsr w9, w9, #1
-; CHECK-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NEXT:    sbfx x11, x11, #0, #1
+; CHECK-NEXT:    lsr w9, w8, #3
+; CHECK-NEXT:    lsr w10, w8, #2
+; CHECK-NEXT:    sbfx x11, x8, #0, #1
+; CHECK-NEXT:    lsr w8, w8, #1
 ; CHECK-NEXT:    sbfx x9, x9, #0, #1
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    stp x12, x9, [sp]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    sbfx x10, x10, #0, #1
+; CHECK-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-NEXT:    stp x11, x8, [sp]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.d, z0.d, #0x1
@@ -1064,30 +1055,29 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i1>* %c) #0 {
 ; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    ldrb w9, [x2]
-; VBITS_GE_512-NEXT:    mov x8, sp
+; VBITS_GE_512-NEXT:    ldrb w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT:    ptrue p1.d
-; VBITS_GE_512-NEXT:    lsr w10, w9, #7
-; VBITS_GE_512-NEXT:    lsr w11, w9, #6
-; VBITS_GE_512-NEXT:    lsr w12, w9, #5
-; VBITS_GE_512-NEXT:    lsr w13, w9, #4
+; VBITS_GE_512-NEXT:    lsr w9, w8, #7
+; VBITS_GE_512-NEXT:    lsr w10, w8, #6
+; VBITS_GE_512-NEXT:    lsr w11, w8, #5
+; VBITS_GE_512-NEXT:    lsr w12, w8, #4
+; VBITS_GE_512-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x10, x10, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x11, x11, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_512-NEXT:    lsr w14, w9, #3
-; VBITS_GE_512-NEXT:    stp x11, x10, [sp, #48]
-; VBITS_GE_512-NEXT:    lsr w10, w9, #2
-; VBITS_GE_512-NEXT:    stp x13, x12, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #0, #1
-; VBITS_GE_512-NEXT:    lsr w9, w9, #1
-; VBITS_GE_512-NEXT:    sbfx x11, x14, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_512-NEXT:    lsr w13, w8, #3
+; VBITS_GE_512-NEXT:    stp x10, x9, [sp, #48]
+; VBITS_GE_512-NEXT:    lsr w9, w8, #2
+; VBITS_GE_512-NEXT:    stp x12, x11, [sp, #32]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #0, #1
+; VBITS_GE_512-NEXT:    lsr w8, w8, #1
+; VBITS_GE_512-NEXT:    sbfx x10, x13, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_512-NEXT:    stp x10, x11, [sp, #16]
-; VBITS_GE_512-NEXT:    stp x12, x9, [sp]
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; VBITS_GE_512-NEXT:    sbfx x8, x8, #0, #1
+; VBITS_GE_512-NEXT:    stp x9, x10, [sp, #16]
+; VBITS_GE_512-NEXT:    stp x11, x8, [sp]
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    and z0.d, z0.d, #0x1
@@ -1115,50 +1105,49 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i1>* %c)
 ; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    ldrh w9, [x2]
-; VBITS_GE_1024-NEXT:    mov x8, sp
+; VBITS_GE_1024-NEXT:    ldrh w8, [x2]
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
 ; VBITS_GE_1024-NEXT:    ptrue p1.d
-; VBITS_GE_1024-NEXT:    lsr w10, w9, #15
-; VBITS_GE_1024-NEXT:    lsr w11, w9, #14
-; VBITS_GE_1024-NEXT:    lsr w12, w9, #13
-; VBITS_GE_1024-NEXT:    lsr w13, w9, #12
+; VBITS_GE_1024-NEXT:    lsr w9, w8, #15
+; VBITS_GE_1024-NEXT:    lsr w10, w8, #14
+; VBITS_GE_1024-NEXT:    lsr w11, w8, #13
+; VBITS_GE_1024-NEXT:    lsr w12, w8, #12
+; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w14, w9, #11
-; VBITS_GE_1024-NEXT:    lsr w15, w9, #10
-; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #112]
-; VBITS_GE_1024-NEXT:    lsr w10, w9, #9
-; VBITS_GE_1024-NEXT:    stp x13, x12, [sp, #96]
-; VBITS_GE_1024-NEXT:    lsr w13, w9, #8
+; VBITS_GE_1024-NEXT:    lsr w13, w8, #11
+; VBITS_GE_1024-NEXT:    lsr w14, w8, #10
+; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #112]
+; VBITS_GE_1024-NEXT:    lsr w9, w8, #9
+; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #96]
+; VBITS_GE_1024-NEXT:    lsr w12, w8, #8
+; VBITS_GE_1024-NEXT:    sbfx x10, x13, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x11, x14, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x15, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w14, w9, #3
-; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #80]
-; VBITS_GE_1024-NEXT:    lsr w11, w9, #6
-; VBITS_GE_1024-NEXT:    stp x13, x10, [sp, #64]
-; VBITS_GE_1024-NEXT:    lsr w10, w9, #7
-; VBITS_GE_1024-NEXT:    lsr w12, w9, #5
-; VBITS_GE_1024-NEXT:    lsr w13, w9, #4
+; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
+; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
+; VBITS_GE_1024-NEXT:    lsr w13, w8, #3
+; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #80]
+; VBITS_GE_1024-NEXT:    lsr w10, w8, #6
+; VBITS_GE_1024-NEXT:    stp x12, x9, [sp, #64]
+; VBITS_GE_1024-NEXT:    lsr w9, w8, #7
+; VBITS_GE_1024-NEXT:    lsr w11, w8, #5
+; VBITS_GE_1024-NEXT:    lsr w12, w8, #4
+; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #48]
-; VBITS_GE_1024-NEXT:    lsr w11, w9, #2
-; VBITS_GE_1024-NEXT:    stp x13, x12, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w9, w9, #1
-; VBITS_GE_1024-NEXT:    sbfx x10, x14, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #16]
-; VBITS_GE_1024-NEXT:    stp x12, x9, [sp]
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #48]
+; VBITS_GE_1024-NEXT:    lsr w10, w8, #2
+; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #32]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #0, #1
+; VBITS_GE_1024-NEXT:    lsr w8, w8, #1
+; VBITS_GE_1024-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_1024-NEXT:    sbfx x8, x8, #0, #1
+; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #16]
+; VBITS_GE_1024-NEXT:    stp x11, x8, [sp]
+; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT:    and z0.d, z0.d, #0x1
@@ -1186,121 +1175,120 @@ define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i1>* %c)
 ; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    ldr w9, [x2]
-; VBITS_GE_2048-NEXT:    mov x8, sp
+; VBITS_GE_2048-NEXT:    ldr w8, [x2]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
 ; VBITS_GE_2048-NEXT:    ptrue p1.d
-; VBITS_GE_2048-NEXT:    ubfx x10, x9, #31, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #30, #2
+; VBITS_GE_2048-NEXT:    ubfx x9, x8, #31, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #30, #2
+; VBITS_GE_2048-NEXT:    // kill: def $w9 killed $w9 killed $x9 def $x9
 ; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #29, #3
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #28, #4
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #29, #3
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #28, #4
+; VBITS_GE_2048-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #27, #5
+; VBITS_GE_2048-NEXT:    ubfx x14, x8, #26, #6
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #27, #5
-; VBITS_GE_2048-NEXT:    ubfx x15, x9, #26, #6
 ; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    // kill: def $w15 killed $w15 killed $x15 def $x15
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x10, x12, #0, #1
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #240]
+; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x12, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x12, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x14, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #25, #7
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #23, #9
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x12, x10, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x10, x15, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #24, #8
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x10, x11, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #22, #10
-; VBITS_GE_2048-NEXT:    sbfx x13, x14, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #21, #11
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x12, x10, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx x10, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #20, #12
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #19, #13
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #25, #7
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #23, #9
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x11, x9, [sp, #224]
+; VBITS_GE_2048-NEXT:    sbfx x9, x14, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #24, #8
 ; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #208]
+; VBITS_GE_2048-NEXT:    sbfx x9, x10, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #18, #14
-; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #22, #10
+; VBITS_GE_2048-NEXT:    sbfx x12, x13, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #21, #11
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #17, #15
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #16, #16
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #15, #17
+; VBITS_GE_2048-NEXT:    stp x11, x9, [sp, #192]
+; VBITS_GE_2048-NEXT:    sbfx x9, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #20, #12
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #19, #13
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
 ; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x12, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #176]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #18, #14
 ; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #14, #18
-; VBITS_GE_2048-NEXT:    sbfx x13, x13, #0, #1
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #13, #19
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #128]
-; VBITS_GE_2048-NEXT:    sbfx x10, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #12, #20
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #11, #21
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #17, #15
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #160]
+; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #16, #16
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #15, #17
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #10, #22
+; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #144]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #14, #18
 ; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #9, #23
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #8, #24
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #7, #25
 ; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #13, #19
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x12, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #128]
+; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #12, #20
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #11, #21
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #112]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #10, #22
 ; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #6, #26
-; VBITS_GE_2048-NEXT:    sbfx x13, x13, #0, #1
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #5, #27
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx x10, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #4, #28
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #3, #29
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #9, #23
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #96]
+; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #8, #24
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #7, #25
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #2, #30
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #1, #31
+; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #80]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #6, #26
 ; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #5, #27
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #32]
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #64]
+; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #4, #28
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #3, #29
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #48]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #2, #30
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #1, #31
+; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
+; VBITS_GE_2048-NEXT:    sbfx x8, x8, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #32]
+; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x14, #0, #1
-; VBITS_GE_2048-NEXT:    stp x10, x12, [sp, #16]
-; VBITS_GE_2048-NEXT:    stp x9, x11, [sp]
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #16]
+; VBITS_GE_2048-NEXT:    stp x8, x10, [sp]
+; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    and z0.d, z0.d, #0x1

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
index 4433a31797dbd..86cef17f1dd91 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -54,75 +54,74 @@ define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, <32 x i1>* %c) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr w9, [x2]
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ldr w8, [x2]
 ; CHECK-NEXT:    ptrue p0.b, vl32
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    asr w10, w9, #31
-; CHECK-NEXT:    sbfx w11, w9, #30, #1
-; CHECK-NEXT:    sbfx w12, w9, #29, #1
-; CHECK-NEXT:    strb w10, [sp, #31]
-; CHECK-NEXT:    sbfx w10, w9, #28, #1
-; CHECK-NEXT:    strb w11, [sp, #30]
-; CHECK-NEXT:    sbfx w11, w9, #27, #1
-; CHECK-NEXT:    strb w12, [sp, #29]
-; CHECK-NEXT:    sbfx w12, w9, #26, #1
-; CHECK-NEXT:    strb w10, [sp, #28]
-; CHECK-NEXT:    sbfx w10, w9, #25, #1
-; CHECK-NEXT:    strb w11, [sp, #27]
-; CHECK-NEXT:    sbfx w11, w9, #24, #1
-; CHECK-NEXT:    strb w12, [sp, #26]
-; CHECK-NEXT:    sbfx w12, w9, #23, #1
-; CHECK-NEXT:    strb w10, [sp, #25]
-; CHECK-NEXT:    sbfx w10, w9, #22, #1
-; CHECK-NEXT:    strb w11, [sp, #24]
-; CHECK-NEXT:    sbfx w11, w9, #21, #1
-; CHECK-NEXT:    strb w12, [sp, #23]
-; CHECK-NEXT:    sbfx w12, w9, #20, #1
-; CHECK-NEXT:    strb w10, [sp, #22]
-; CHECK-NEXT:    sbfx w10, w9, #19, #1
-; CHECK-NEXT:    strb w11, [sp, #21]
-; CHECK-NEXT:    sbfx w11, w9, #18, #1
-; CHECK-NEXT:    strb w12, [sp, #20]
-; CHECK-NEXT:    sbfx w12, w9, #17, #1
-; CHECK-NEXT:    strb w10, [sp, #19]
-; CHECK-NEXT:    sbfx w10, w9, #16, #1
-; CHECK-NEXT:    strb w11, [sp, #18]
-; CHECK-NEXT:    sbfx w11, w9, #15, #1
-; CHECK-NEXT:    strb w12, [sp, #17]
-; CHECK-NEXT:    sbfx w12, w9, #14, #1
-; CHECK-NEXT:    strb w10, [sp, #16]
-; CHECK-NEXT:    sbfx w10, w9, #13, #1
-; CHECK-NEXT:    strb w11, [sp, #15]
-; CHECK-NEXT:    sbfx w11, w9, #12, #1
-; CHECK-NEXT:    strb w12, [sp, #14]
-; CHECK-NEXT:    sbfx w12, w9, #11, #1
-; CHECK-NEXT:    strb w10, [sp, #13]
-; CHECK-NEXT:    sbfx w10, w9, #10, #1
-; CHECK-NEXT:    strb w11, [sp, #12]
-; CHECK-NEXT:    sbfx w11, w9, #9, #1
-; CHECK-NEXT:    strb w12, [sp, #11]
-; CHECK-NEXT:    sbfx w12, w9, #8, #1
-; CHECK-NEXT:    strb w10, [sp, #10]
-; CHECK-NEXT:    sbfx w10, w9, #7, #1
-; CHECK-NEXT:    strb w11, [sp, #9]
-; CHECK-NEXT:    sbfx w11, w9, #6, #1
-; CHECK-NEXT:    strb w12, [sp, #8]
-; CHECK-NEXT:    sbfx w12, w9, #5, #1
-; CHECK-NEXT:    strb w10, [sp, #7]
-; CHECK-NEXT:    sbfx w10, w9, #4, #1
-; CHECK-NEXT:    strb w11, [sp, #6]
-; CHECK-NEXT:    sbfx w11, w9, #3, #1
-; CHECK-NEXT:    strb w12, [sp, #5]
-; CHECK-NEXT:    sbfx w12, w9, #2, #1
-; CHECK-NEXT:    strb w10, [sp, #4]
-; CHECK-NEXT:    sbfx w10, w9, #1, #1
-; CHECK-NEXT:    sbfx w9, w9, #0, #1
-; CHECK-NEXT:    strb w11, [sp, #3]
-; CHECK-NEXT:    strb w12, [sp, #2]
-; CHECK-NEXT:    strb w10, [sp, #1]
-; CHECK-NEXT:    strb w9, [sp]
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    asr w9, w8, #31
+; CHECK-NEXT:    sbfx w10, w8, #30, #1
+; CHECK-NEXT:    sbfx w11, w8, #29, #1
+; CHECK-NEXT:    strb w9, [sp, #31]
+; CHECK-NEXT:    sbfx w9, w8, #28, #1
+; CHECK-NEXT:    strb w10, [sp, #30]
+; CHECK-NEXT:    sbfx w10, w8, #27, #1
+; CHECK-NEXT:    strb w11, [sp, #29]
+; CHECK-NEXT:    sbfx w11, w8, #26, #1
+; CHECK-NEXT:    strb w9, [sp, #28]
+; CHECK-NEXT:    sbfx w9, w8, #25, #1
+; CHECK-NEXT:    strb w10, [sp, #27]
+; CHECK-NEXT:    sbfx w10, w8, #24, #1
+; CHECK-NEXT:    strb w11, [sp, #26]
+; CHECK-NEXT:    sbfx w11, w8, #23, #1
+; CHECK-NEXT:    strb w9, [sp, #25]
+; CHECK-NEXT:    sbfx w9, w8, #22, #1
+; CHECK-NEXT:    strb w10, [sp, #24]
+; CHECK-NEXT:    sbfx w10, w8, #21, #1
+; CHECK-NEXT:    strb w11, [sp, #23]
+; CHECK-NEXT:    sbfx w11, w8, #20, #1
+; CHECK-NEXT:    strb w9, [sp, #22]
+; CHECK-NEXT:    sbfx w9, w8, #19, #1
+; CHECK-NEXT:    strb w10, [sp, #21]
+; CHECK-NEXT:    sbfx w10, w8, #18, #1
+; CHECK-NEXT:    strb w11, [sp, #20]
+; CHECK-NEXT:    sbfx w11, w8, #17, #1
+; CHECK-NEXT:    strb w9, [sp, #19]
+; CHECK-NEXT:    sbfx w9, w8, #16, #1
+; CHECK-NEXT:    strb w10, [sp, #18]
+; CHECK-NEXT:    sbfx w10, w8, #15, #1
+; CHECK-NEXT:    strb w11, [sp, #17]
+; CHECK-NEXT:    sbfx w11, w8, #14, #1
+; CHECK-NEXT:    strb w9, [sp, #16]
+; CHECK-NEXT:    sbfx w9, w8, #13, #1
+; CHECK-NEXT:    strb w10, [sp, #15]
+; CHECK-NEXT:    sbfx w10, w8, #12, #1
+; CHECK-NEXT:    strb w11, [sp, #14]
+; CHECK-NEXT:    sbfx w11, w8, #11, #1
+; CHECK-NEXT:    strb w9, [sp, #13]
+; CHECK-NEXT:    sbfx w9, w8, #10, #1
+; CHECK-NEXT:    strb w10, [sp, #12]
+; CHECK-NEXT:    sbfx w10, w8, #9, #1
+; CHECK-NEXT:    strb w11, [sp, #11]
+; CHECK-NEXT:    sbfx w11, w8, #8, #1
+; CHECK-NEXT:    strb w9, [sp, #10]
+; CHECK-NEXT:    sbfx w9, w8, #7, #1
+; CHECK-NEXT:    strb w10, [sp, #9]
+; CHECK-NEXT:    sbfx w10, w8, #6, #1
+; CHECK-NEXT:    strb w11, [sp, #8]
+; CHECK-NEXT:    sbfx w11, w8, #5, #1
+; CHECK-NEXT:    strb w9, [sp, #7]
+; CHECK-NEXT:    sbfx w9, w8, #4, #1
+; CHECK-NEXT:    strb w10, [sp, #6]
+; CHECK-NEXT:    sbfx w10, w8, #3, #1
+; CHECK-NEXT:    strb w11, [sp, #5]
+; CHECK-NEXT:    sbfx w11, w8, #2, #1
+; CHECK-NEXT:    strb w9, [sp, #4]
+; CHECK-NEXT:    sbfx w9, w8, #1, #1
+; CHECK-NEXT:    sbfx w8, w8, #0, #1
+; CHECK-NEXT:    strb w10, [sp, #3]
+; CHECK-NEXT:    strb w11, [sp, #2]
+; CHECK-NEXT:    strb w9, [sp, #1]
+; CHECK-NEXT:    strb w8, [sp]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.b, z0.b, #0x1
@@ -150,139 +149,138 @@ define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, <64 x i1>* %c) #0 {
 ; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    ldr x9, [x2]
-; VBITS_GE_512-NEXT:    mov x8, sp
+; VBITS_GE_512-NEXT:    ldr x8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
 ; VBITS_GE_512-NEXT:    ptrue p1.b
-; VBITS_GE_512-NEXT:    asr x10, x9, #63
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #63]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #62]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #61]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #60]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #59]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #58]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #57]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #55]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #54]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #53]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #52]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #51]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #50]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #49]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #47]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #46]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #45]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #44]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #43]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #42]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #41]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #39]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #38]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #37]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #36]
-; VBITS_GE_512-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #35]
-; VBITS_GE_512-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #34]
-; VBITS_GE_512-NEXT:    asr w12, w9, #31
-; VBITS_GE_512-NEXT:    strb w10, [sp, #33]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #31]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #30]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #29]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #28]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #27]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #26]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #25]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #23]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #22]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #21]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #20]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #19]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #18]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #17]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #15]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #14]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #13]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #12]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #11]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #10]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #9]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #8]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #7]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #6]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_512-NEXT:    strb w11, [sp, #5]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_512-NEXT:    strb w12, [sp, #4]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_512-NEXT:    strb w10, [sp, #3]
-; VBITS_GE_512-NEXT:    strb w11, [sp, #2]
-; VBITS_GE_512-NEXT:    strb w12, [sp, #1]
-; VBITS_GE_512-NEXT:    strb w9, [sp]
-; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; VBITS_GE_512-NEXT:    asr x9, x8, #63
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #63]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #62]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #61]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #60]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #59]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #58]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #57]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #56]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #55]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #54]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #53]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #52]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #51]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #50]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #49]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #48]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #47]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #46]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #45]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #44]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #43]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #42]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #41]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #40]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #39]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #38]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #37]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #36]
+; VBITS_GE_512-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #35]
+; VBITS_GE_512-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #34]
+; VBITS_GE_512-NEXT:    asr w11, w8, #31
+; VBITS_GE_512-NEXT:    strb w9, [sp, #33]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #32]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #31]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #30]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #29]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #28]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #27]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #26]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #25]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #24]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #23]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #22]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #21]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #20]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #19]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #18]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #17]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #16]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #15]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #14]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #13]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #12]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #11]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #10]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #9]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #8]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #7]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #6]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_512-NEXT:    strb w10, [sp, #5]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_512-NEXT:    strb w11, [sp, #4]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_512-NEXT:    strb w9, [sp, #3]
+; VBITS_GE_512-NEXT:    strb w10, [sp, #2]
+; VBITS_GE_512-NEXT:    strb w11, [sp, #1]
+; VBITS_GE_512-NEXT:    strb w8, [sp]
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [sp]
 ; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1b { z2.b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    and z0.b, z0.b, #0x1
@@ -310,268 +308,267 @@ define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, <128 x i1>* %c) #0 {
 ; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    ldr x9, [x2, #8]
-; VBITS_GE_1024-NEXT:    mov x8, sp
+; VBITS_GE_1024-NEXT:    ldr x8, [x2, #8]
 ; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
 ; VBITS_GE_1024-NEXT:    ptrue p1.b
-; VBITS_GE_1024-NEXT:    asr x10, x9, #63
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #127]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #126]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #125]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #124]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #123]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #122]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #121]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #119]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #118]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #117]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #116]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #115]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #114]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #113]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #111]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #110]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #109]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #108]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #107]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #106]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #105]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #103]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #102]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #101]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #100]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #99]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #98]
-; VBITS_GE_1024-NEXT:    asr w12, w9, #31
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #97]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #95]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #94]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #93]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #92]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #91]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #90]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #89]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #87]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #86]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #85]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #84]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #83]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #82]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #81]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #79]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #78]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #77]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #76]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #75]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #74]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #73]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #71]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #70]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #69]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #68]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #67]
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #66]
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #65]
-; VBITS_GE_1024-NEXT:    strb w9, [sp, #64]
-; VBITS_GE_1024-NEXT:    ldr x9, [x2]
-; VBITS_GE_1024-NEXT:    asr x10, x9, #63
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #63]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #62]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #61]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #60]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #59]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #58]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #57]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #55]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #54]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #53]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #52]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #51]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #50]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #49]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #47]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #46]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #45]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #44]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #43]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #42]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #41]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #39]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #38]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #37]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #36]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #35]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #34]
-; VBITS_GE_1024-NEXT:    asr w12, w9, #31
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #33]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #31]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #30]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #29]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #28]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #27]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #26]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #25]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #23]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #22]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #21]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #20]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #19]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #18]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #17]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #15]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #14]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #13]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #12]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #11]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #10]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #9]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #8]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #7]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #6]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #5]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #4]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_1024-NEXT:    strb w10, [sp, #3]
-; VBITS_GE_1024-NEXT:    strb w11, [sp, #2]
-; VBITS_GE_1024-NEXT:    strb w12, [sp, #1]
-; VBITS_GE_1024-NEXT:    strb w9, [sp]
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; VBITS_GE_1024-NEXT:    asr x9, x8, #63
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #127]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #126]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #125]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #124]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #123]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #122]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #121]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #120]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #119]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #118]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #117]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #116]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #115]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #114]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #113]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #112]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #111]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #110]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #109]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #108]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #107]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #106]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #105]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #104]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #103]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #102]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #101]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #100]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #99]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #98]
+; VBITS_GE_1024-NEXT:    asr w11, w8, #31
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #97]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #96]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #95]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #94]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #93]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #92]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #91]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #90]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #89]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #88]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #87]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #86]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #85]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #84]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #83]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #82]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #81]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #80]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #79]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #78]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #77]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #76]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #75]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #74]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #73]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #72]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #71]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #70]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #69]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #68]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #67]
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #66]
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #65]
+; VBITS_GE_1024-NEXT:    strb w8, [sp, #64]
+; VBITS_GE_1024-NEXT:    ldr x8, [x2]
+; VBITS_GE_1024-NEXT:    asr x9, x8, #63
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #63]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #62]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #61]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #60]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #59]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #58]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #57]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #56]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #55]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #54]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #53]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #52]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #51]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #50]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #49]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #48]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #47]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #46]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #45]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #44]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #43]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #42]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #41]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #40]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #39]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #38]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #37]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #36]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #35]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #34]
+; VBITS_GE_1024-NEXT:    asr w11, w8, #31
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #33]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #32]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #31]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #30]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #29]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #28]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #27]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #26]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #25]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #24]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #23]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #22]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #21]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #20]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #19]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #18]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #17]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #16]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #15]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #14]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #13]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #12]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #11]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #10]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #9]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #8]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #7]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #6]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #5]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #4]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_1024-NEXT:    strb w9, [sp, #3]
+; VBITS_GE_1024-NEXT:    strb w10, [sp, #2]
+; VBITS_GE_1024-NEXT:    strb w11, [sp, #1]
+; VBITS_GE_1024-NEXT:    strb w8, [sp]
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [sp]
 ; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ld1b { z2.b }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT:    and z0.b, z0.b, #0x1
@@ -599,526 +596,525 @@ define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, <256 x i1>* %c) #0 {
 ; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    ldr x9, [x2, #24]
-; VBITS_GE_2048-NEXT:    mov x8, sp
+; VBITS_GE_2048-NEXT:    ldr x8, [x2, #24]
 ; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
 ; VBITS_GE_2048-NEXT:    ptrue p1.b
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #255]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #254]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #253]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #252]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #251]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #250]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #249]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #247]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #246]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #245]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #244]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #243]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #242]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #241]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #239]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #238]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #237]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #236]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #235]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #234]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #233]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #231]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #230]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #229]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #228]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #227]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #226]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #225]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #223]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #222]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #221]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #220]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #219]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #218]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #217]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #215]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #214]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #213]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #212]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #211]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #210]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #209]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #207]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #206]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #205]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #204]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #203]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #202]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #201]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #199]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #198]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #197]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #196]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #195]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #194]
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #193]
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #192]
-; VBITS_GE_2048-NEXT:    ldr x9, [x2, #16]
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #191]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #190]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #189]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #188]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #187]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #186]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #185]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #183]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #182]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #181]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #180]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #179]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #178]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #177]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #175]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #174]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #173]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #172]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #171]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #170]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #169]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #167]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #166]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #165]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #164]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #163]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #162]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #161]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #159]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #158]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #157]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #156]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #155]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #154]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #153]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #151]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #150]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #149]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #148]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #147]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #146]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #145]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #143]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #142]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #141]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #140]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #139]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #138]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #137]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #136]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #135]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #134]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #133]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #132]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #131]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #130]
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #129]
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #128]
-; VBITS_GE_2048-NEXT:    ldr x9, [x2, #8]
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #127]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #126]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #125]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #124]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #123]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #122]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #121]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #119]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #118]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #117]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #116]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #115]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #114]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #113]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #111]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #110]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #109]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #108]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #107]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #106]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #105]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #103]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #102]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #101]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #100]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #99]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #98]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #97]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #95]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #94]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #93]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #92]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #91]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #90]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #89]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #87]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #86]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #85]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #84]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #83]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #82]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #81]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #79]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #78]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #77]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #76]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #75]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #74]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #73]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #71]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #70]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #69]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #68]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #67]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #66]
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #65]
-; VBITS_GE_2048-NEXT:    strb w9, [sp, #64]
-; VBITS_GE_2048-NEXT:    ldr x9, [x2]
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #63]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #62]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #61]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #60]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #59]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #58]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #57]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #55]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #54]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #53]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #52]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #51]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #50]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #49]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #47]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #46]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #45]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #44]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #43]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #42]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #41]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #39]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #38]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #37]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #36]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #35]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #34]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #33]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #31]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #30]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #29]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #28]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #27]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #26]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #25]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #23]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #22]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #21]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #20]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #19]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #18]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #17]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #15]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #14]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #13]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #12]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #11]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #10]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #9]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #8]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #7]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #6]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #5]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #4]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strb w10, [sp, #3]
-; VBITS_GE_2048-NEXT:    strb w11, [sp, #2]
-; VBITS_GE_2048-NEXT:    strb w12, [sp, #1]
-; VBITS_GE_2048-NEXT:    strb w9, [sp]
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #255]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #254]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #253]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #252]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #251]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #250]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #249]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #248]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #247]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #246]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #245]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #244]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #243]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #242]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #241]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #240]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #239]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #238]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #237]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #236]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #235]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #234]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #233]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #232]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #231]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #230]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #229]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #228]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #227]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #226]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #225]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #224]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #223]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #222]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #221]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #220]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #219]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #218]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #217]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #216]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #215]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #214]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #213]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #212]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #211]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #210]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #209]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #208]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #207]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #206]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #205]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #204]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #203]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #202]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #201]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #200]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #199]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #198]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #197]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #196]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #195]
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #194]
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #193]
+; VBITS_GE_2048-NEXT:    strb w8, [sp, #192]
+; VBITS_GE_2048-NEXT:    ldr x8, [x2, #16]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #191]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #190]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #189]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #188]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #187]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #186]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #185]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #184]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #183]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #182]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #181]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #180]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #179]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #178]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #177]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #176]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #175]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #174]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #173]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #172]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #171]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #170]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #169]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #168]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #167]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #166]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #165]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #164]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #163]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #162]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #161]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #160]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #159]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #158]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #157]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #156]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #155]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #154]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #153]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #152]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #151]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #150]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #149]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #148]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #147]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #146]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #145]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #144]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #143]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #142]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #141]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #140]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #139]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #138]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #137]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #136]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #135]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #134]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #133]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #132]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #131]
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #130]
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #129]
+; VBITS_GE_2048-NEXT:    strb w8, [sp, #128]
+; VBITS_GE_2048-NEXT:    ldr x8, [x2, #8]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #127]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #126]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #125]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #124]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #123]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #122]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #121]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #120]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #119]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #118]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #117]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #116]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #115]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #114]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #113]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #112]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #111]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #110]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #109]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #108]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #107]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #106]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #105]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #104]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #103]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #102]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #101]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #100]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #99]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #98]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #97]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #96]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #95]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #94]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #93]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #92]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #91]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #90]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #89]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #88]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #87]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #86]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #85]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #84]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #83]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #82]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #81]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #80]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #79]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #78]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #77]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #76]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #75]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #74]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #73]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #72]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #71]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #70]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #69]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #68]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #67]
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #66]
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #65]
+; VBITS_GE_2048-NEXT:    strb w8, [sp, #64]
+; VBITS_GE_2048-NEXT:    ldr x8, [x2]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #63]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #62]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #61]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #60]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #59]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #58]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #57]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #56]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #55]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #54]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #53]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #52]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #51]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #50]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #49]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #48]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #47]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #46]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #45]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #44]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #43]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #42]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #41]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #40]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #39]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #38]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #37]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #36]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #35]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #34]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #33]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #32]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #31]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #30]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #29]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #28]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #27]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #26]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #25]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #24]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #23]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #22]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #21]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #20]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #19]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #18]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #17]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #16]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #15]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #14]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #13]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #12]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #11]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #10]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #9]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #8]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #7]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #6]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #5]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #4]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strb w9, [sp, #3]
+; VBITS_GE_2048-NEXT:    strb w10, [sp, #2]
+; VBITS_GE_2048-NEXT:    strb w11, [sp, #1]
+; VBITS_GE_2048-NEXT:    strb w8, [sp]
+; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [sp]
 ; VBITS_GE_2048-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ld1b { z2.b }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    and z0.b, z0.b, #0x1
@@ -1171,43 +1167,42 @@ define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, <16 x i1>* %c) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldrh w9, [x2]
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ldrh w8, [x2]
 ; CHECK-NEXT:    ptrue p0.h, vl16
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    sbfx w10, w9, #15, #1
-; CHECK-NEXT:    sbfx w11, w9, #14, #1
-; CHECK-NEXT:    sbfx w12, w9, #13, #1
-; CHECK-NEXT:    strh w10, [sp, #30]
-; CHECK-NEXT:    sbfx w10, w9, #12, #1
-; CHECK-NEXT:    strh w11, [sp, #28]
-; CHECK-NEXT:    sbfx w11, w9, #11, #1
-; CHECK-NEXT:    strh w12, [sp, #26]
-; CHECK-NEXT:    sbfx w12, w9, #10, #1
-; CHECK-NEXT:    strh w10, [sp, #24]
-; CHECK-NEXT:    sbfx w10, w9, #9, #1
-; CHECK-NEXT:    strh w11, [sp, #22]
-; CHECK-NEXT:    sbfx w11, w9, #8, #1
-; CHECK-NEXT:    strh w12, [sp, #20]
-; CHECK-NEXT:    sbfx w12, w9, #7, #1
-; CHECK-NEXT:    strh w10, [sp, #18]
-; CHECK-NEXT:    sbfx w10, w9, #6, #1
-; CHECK-NEXT:    strh w11, [sp, #16]
-; CHECK-NEXT:    sbfx w11, w9, #5, #1
-; CHECK-NEXT:    strh w12, [sp, #14]
-; CHECK-NEXT:    sbfx w12, w9, #4, #1
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    sbfx w10, w9, #3, #1
-; CHECK-NEXT:    strh w11, [sp, #10]
-; CHECK-NEXT:    sbfx w11, w9, #2, #1
-; CHECK-NEXT:    strh w12, [sp, #8]
-; CHECK-NEXT:    sbfx w12, w9, #1, #1
-; CHECK-NEXT:    sbfx w9, w9, #0, #1
-; CHECK-NEXT:    strh w10, [sp, #6]
-; CHECK-NEXT:    strh w11, [sp, #4]
-; CHECK-NEXT:    strh w12, [sp, #2]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    sbfx w9, w8, #15, #1
+; CHECK-NEXT:    sbfx w10, w8, #14, #1
+; CHECK-NEXT:    sbfx w11, w8, #13, #1
+; CHECK-NEXT:    strh w9, [sp, #30]
+; CHECK-NEXT:    sbfx w9, w8, #12, #1
+; CHECK-NEXT:    strh w10, [sp, #28]
+; CHECK-NEXT:    sbfx w10, w8, #11, #1
+; CHECK-NEXT:    strh w11, [sp, #26]
+; CHECK-NEXT:    sbfx w11, w8, #10, #1
+; CHECK-NEXT:    strh w9, [sp, #24]
+; CHECK-NEXT:    sbfx w9, w8, #9, #1
+; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    sbfx w10, w8, #8, #1
+; CHECK-NEXT:    strh w11, [sp, #20]
+; CHECK-NEXT:    sbfx w11, w8, #7, #1
+; CHECK-NEXT:    strh w9, [sp, #18]
+; CHECK-NEXT:    sbfx w9, w8, #6, #1
+; CHECK-NEXT:    strh w10, [sp, #16]
+; CHECK-NEXT:    sbfx w10, w8, #5, #1
+; CHECK-NEXT:    strh w11, [sp, #14]
+; CHECK-NEXT:    sbfx w11, w8, #4, #1
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    sbfx w9, w8, #3, #1
+; CHECK-NEXT:    strh w10, [sp, #10]
+; CHECK-NEXT:    sbfx w10, w8, #2, #1
+; CHECK-NEXT:    strh w11, [sp, #8]
+; CHECK-NEXT:    sbfx w11, w8, #1, #1
+; CHECK-NEXT:    sbfx w8, w8, #0, #1
+; CHECK-NEXT:    strh w9, [sp, #6]
+; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    strh w11, [sp, #2]
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.h, z0.h, #0x1
@@ -1235,75 +1230,74 @@ define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i1>* %c) #0 {
 ; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    ldr w9, [x2]
-; VBITS_GE_512-NEXT:    mov x8, sp
+; VBITS_GE_512-NEXT:    ldr w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
 ; VBITS_GE_512-NEXT:    ptrue p1.h
-; VBITS_GE_512-NEXT:    asr w10, w9, #31
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #30, #1
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #29, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #62]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #28, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #60]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #27, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #58]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #26, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #25, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #54]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #24, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #52]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #23, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #50]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #22, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #21, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #46]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #20, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #44]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #19, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #42]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #18, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #17, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #38]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #16, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #36]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #15, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #34]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #14, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #13, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #30]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #12, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #28]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #11, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #26]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #10, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #9, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #22]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #8, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #20]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #18]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #14]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #4, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #12]
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #3, #1
-; VBITS_GE_512-NEXT:    strh w12, [sp, #10]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #2, #1
-; VBITS_GE_512-NEXT:    strh w10, [sp, #8]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_512-NEXT:    strh w11, [sp, #6]
-; VBITS_GE_512-NEXT:    strh w12, [sp, #4]
-; VBITS_GE_512-NEXT:    strh w10, [sp, #2]
-; VBITS_GE_512-NEXT:    strh w9, [sp]
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; VBITS_GE_512-NEXT:    asr w9, w8, #31
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #30, #1
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #29, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #62]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #28, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #60]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #27, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #58]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #26, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #56]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #25, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #54]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #24, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #52]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #23, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #50]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #22, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #48]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #21, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #46]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #20, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #44]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #19, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #42]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #18, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #40]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #17, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #38]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #16, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #36]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #15, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #34]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #14, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #32]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #13, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #30]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #12, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #28]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #11, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #26]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #10, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #24]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #9, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #22]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #8, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #20]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #18]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #16]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #14]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #4, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #12]
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #3, #1
+; VBITS_GE_512-NEXT:    strh w11, [sp, #10]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #2, #1
+; VBITS_GE_512-NEXT:    strh w9, [sp, #8]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #1, #1
+; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_512-NEXT:    strh w10, [sp, #6]
+; VBITS_GE_512-NEXT:    strh w11, [sp, #4]
+; VBITS_GE_512-NEXT:    strh w9, [sp, #2]
+; VBITS_GE_512-NEXT:    strh w8, [sp]
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    and z0.h, z0.h, #0x1
@@ -1331,139 +1325,138 @@ define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, <64 x i1>* %c) #0 {
 ; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    ldr x9, [x2]
-; VBITS_GE_1024-NEXT:    mov x8, sp
+; VBITS_GE_1024-NEXT:    ldr x8, [x2]
 ; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
 ; VBITS_GE_1024-NEXT:    ptrue p1.h
-; VBITS_GE_1024-NEXT:    asr x10, x9, #63
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #126]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #124]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #122]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #118]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #116]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #114]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #110]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #108]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #106]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #102]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #100]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #98]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #94]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #92]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #90]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #86]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #84]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #82]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #78]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #76]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #74]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #70]
-; VBITS_GE_1024-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #68]
-; VBITS_GE_1024-NEXT:    asr w12, w9, #31
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #66]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #64]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #62]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #60]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #58]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #54]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #52]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #50]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #46]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #44]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #42]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #38]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #36]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #34]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #30]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #28]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #26]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #22]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #20]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #18]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #14]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #12]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #10]
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #8]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_1024-NEXT:    strh w10, [sp, #6]
-; VBITS_GE_1024-NEXT:    strh w11, [sp, #4]
-; VBITS_GE_1024-NEXT:    strh w12, [sp, #2]
-; VBITS_GE_1024-NEXT:    strh w9, [sp]
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; VBITS_GE_1024-NEXT:    asr x9, x8, #63
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #126]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #124]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #122]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #120]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #118]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #116]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #114]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #112]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #110]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #108]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #106]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #104]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #102]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #100]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #98]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #96]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #94]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #92]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #90]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #88]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #86]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #84]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #82]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #80]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #78]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #76]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #74]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #72]
+; VBITS_GE_1024-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #70]
+; VBITS_GE_1024-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #68]
+; VBITS_GE_1024-NEXT:    asr w11, w8, #31
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #66]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #64]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #62]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #60]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #58]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #56]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #54]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #52]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #50]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #48]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #46]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #44]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #42]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #40]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #38]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #36]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #34]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #32]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #30]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #28]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #26]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #24]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #22]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #20]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #18]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #16]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #14]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #12]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #10]
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #8]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_1024-NEXT:    strh w9, [sp, #6]
+; VBITS_GE_1024-NEXT:    strh w10, [sp, #4]
+; VBITS_GE_1024-NEXT:    strh w11, [sp, #2]
+; VBITS_GE_1024-NEXT:    strh w8, [sp]
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT:    and z0.h, z0.h, #0x1
@@ -1491,268 +1484,267 @@ define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, <128 x i1>* %c) #0
 ; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    ldr x9, [x2, #8]
-; VBITS_GE_2048-NEXT:    mov x8, sp
+; VBITS_GE_2048-NEXT:    ldr x8, [x2, #8]
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
 ; VBITS_GE_2048-NEXT:    ptrue p1.h
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #254]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #252]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #250]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #246]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #244]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #242]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #238]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #236]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #234]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #230]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #228]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #226]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #222]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #220]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #218]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #214]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #212]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #210]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #206]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #204]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #202]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #198]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #196]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #194]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #190]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #188]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #186]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #182]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #180]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #178]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #174]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #172]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #170]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #166]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #164]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #162]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #158]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #156]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #154]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #150]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #148]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #146]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #142]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #140]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #138]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #136]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #134]
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #132]
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #130]
-; VBITS_GE_2048-NEXT:    strh w9, [sp, #128]
-; VBITS_GE_2048-NEXT:    ldr x9, [x2]
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #126]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #60, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #124]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #59, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #122]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #58, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #57, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #118]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #56, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #116]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #55, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #114]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #54, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #53, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #110]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #52, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #108]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #106]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #102]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #48, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #100]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #47, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #98]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #46, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #45, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #94]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #44, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #92]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #43, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #90]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #42, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #41, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #86]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #40, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #84]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #82]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #78]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #36, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #76]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #35, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #74]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #34, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #33, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #70]
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #32, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #68]
-; VBITS_GE_2048-NEXT:    asr w12, w9, #31
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #66]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #30, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #29, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #62]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #28, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #60]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #58]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #54]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #24, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #52]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #23, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #50]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #22, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #21, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #46]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #20, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #44]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #19, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #42]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #18, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #17, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #38]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #16, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #36]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #34]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #30]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #12, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #28]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #11, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #26]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #10, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #9, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #22]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #8, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #20]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #7, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #18]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #6, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #5, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #14]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #4, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #12]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #10]
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #8]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    strh w10, [sp, #6]
-; VBITS_GE_2048-NEXT:    strh w11, [sp, #4]
-; VBITS_GE_2048-NEXT:    strh w12, [sp, #2]
-; VBITS_GE_2048-NEXT:    strh w9, [sp]
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #254]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #252]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #250]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #248]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #246]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #244]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #242]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #240]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #238]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #236]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #234]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #232]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #230]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #228]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #226]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #224]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #222]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #220]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #218]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #216]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #214]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #212]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #210]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #208]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #206]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #204]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #202]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #200]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #198]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #196]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #194]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #192]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #190]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #188]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #186]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #184]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #182]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #180]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #178]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #176]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #174]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #172]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #170]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #168]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #166]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #164]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #162]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #160]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #158]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #156]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #154]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #152]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #150]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #148]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #146]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #144]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #142]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #140]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #138]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #136]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #134]
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #132]
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #130]
+; VBITS_GE_2048-NEXT:    strh w8, [sp, #128]
+; VBITS_GE_2048-NEXT:    ldr x8, [x2]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #126]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #60, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #124]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #59, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #122]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #58, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #120]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #57, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #118]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #56, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #116]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #55, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #114]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #54, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #112]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #53, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #110]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #52, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #108]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #106]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #104]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #102]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #48, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #100]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #47, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #98]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #46, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #96]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #45, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #94]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #44, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #92]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #43, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #90]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #42, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #88]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #41, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #86]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #40, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #84]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #82]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #80]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #78]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #36, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #76]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #35, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #74]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #34, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #72]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #33, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #70]
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #32, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #68]
+; VBITS_GE_2048-NEXT:    asr w11, w8, #31
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #66]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #30, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #64]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #29, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #62]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #28, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #60]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #58]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #56]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #54]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #24, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #52]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #23, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #50]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #22, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #48]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #21, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #46]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #20, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #44]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #19, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #42]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #18, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #40]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #17, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #38]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #16, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #36]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #34]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #32]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #30]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #12, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #28]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #11, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #26]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #10, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #24]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #9, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #22]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #8, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #20]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #7, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #18]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #6, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #16]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #5, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #14]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #4, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #12]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #10]
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #8]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    strh w9, [sp, #6]
+; VBITS_GE_2048-NEXT:    strh w10, [sp, #4]
+; VBITS_GE_2048-NEXT:    strh w11, [sp, #2]
+; VBITS_GE_2048-NEXT:    strh w8, [sp]
+; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; VBITS_GE_2048-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ld1h { z2.h }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    and z0.h, z0.h, #0x1
@@ -1805,23 +1797,22 @@ define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, <8 x i1>* %c) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldrb w9, [x2]
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ldrb w8, [x2]
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    sbfx w10, w9, #7, #1
-; CHECK-NEXT:    sbfx w11, w9, #6, #1
-; CHECK-NEXT:    sbfx w12, w9, #5, #1
-; CHECK-NEXT:    sbfx w13, w9, #4, #1
-; CHECK-NEXT:    stp w11, w10, [sp, #24]
-; CHECK-NEXT:    sbfx w10, w9, #3, #1
-; CHECK-NEXT:    sbfx w11, w9, #2, #1
-; CHECK-NEXT:    stp w13, w12, [sp, #16]
-; CHECK-NEXT:    sbfx w12, w9, #1, #1
-; CHECK-NEXT:    sbfx w9, w9, #0, #1
-; CHECK-NEXT:    stp w11, w10, [sp, #8]
-; CHECK-NEXT:    stp w9, w12, [sp]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    sbfx w9, w8, #7, #1
+; CHECK-NEXT:    sbfx w10, w8, #6, #1
+; CHECK-NEXT:    sbfx w11, w8, #5, #1
+; CHECK-NEXT:    sbfx w12, w8, #4, #1
+; CHECK-NEXT:    stp w10, w9, [sp, #24]
+; CHECK-NEXT:    sbfx w9, w8, #3, #1
+; CHECK-NEXT:    sbfx w10, w8, #2, #1
+; CHECK-NEXT:    stp w12, w11, [sp, #16]
+; CHECK-NEXT:    sbfx w11, w8, #1, #1
+; CHECK-NEXT:    sbfx w8, w8, #0, #1
+; CHECK-NEXT:    stp w10, w9, [sp, #8]
+; CHECK-NEXT:    stp w8, w11, [sp]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
@@ -1849,35 +1840,34 @@ define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, <16 x i1>* %c) #0 {
 ; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    ldrh w9, [x2]
-; VBITS_GE_512-NEXT:    mov x8, sp
+; VBITS_GE_512-NEXT:    ldrh w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT:    ptrue p1.s
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_512-NEXT:    sbfx w13, w9, #12, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #56]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #11, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #10, #1
-; VBITS_GE_512-NEXT:    stp w13, w12, [sp, #48]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #9, #1
-; VBITS_GE_512-NEXT:    sbfx w13, w9, #8, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #40]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_512-NEXT:    stp w13, w12, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_512-NEXT:    sbfx w13, w9, #4, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #24]
-; VBITS_GE_512-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_512-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_512-NEXT:    stp w13, w12, [sp, #16]
-; VBITS_GE_512-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_512-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_512-NEXT:    stp w11, w10, [sp, #8]
-; VBITS_GE_512-NEXT:    stp w9, w12, [sp]
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_512-NEXT:    sbfx w12, w8, #12, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #56]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #11, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #10, #1
+; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #48]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #9, #1
+; VBITS_GE_512-NEXT:    sbfx w12, w8, #8, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #40]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #32]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_512-NEXT:    sbfx w12, w8, #4, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #24]
+; VBITS_GE_512-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_512-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_512-NEXT:    stp w12, w11, [sp, #16]
+; VBITS_GE_512-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_512-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_512-NEXT:    stp w10, w9, [sp, #8]
+; VBITS_GE_512-NEXT:    stp w8, w11, [sp]
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    and z0.s, z0.s, #0x1
@@ -1905,59 +1895,58 @@ define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, <32 x i1>* %c) #0 {
 ; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    ldr w9, [x2]
-; VBITS_GE_1024-NEXT:    mov x8, sp
+; VBITS_GE_1024-NEXT:    ldr w8, [x2]
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
 ; VBITS_GE_1024-NEXT:    ptrue p1.s
-; VBITS_GE_1024-NEXT:    asr w10, w9, #31
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #30, #1
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #29, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #28, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #120]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #112]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #24, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #104]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #23, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #22, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #96]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #21, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #20, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #88]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #19, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #18, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #80]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #17, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #16, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #72]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #64]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #12, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #56]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #11, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #10, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #48]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #9, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #8, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #40]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_1024-NEXT:    sbfx w13, w9, #4, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #24]
-; VBITS_GE_1024-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_1024-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_1024-NEXT:    stp w13, w12, [sp, #16]
-; VBITS_GE_1024-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_1024-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_1024-NEXT:    stp w11, w10, [sp, #8]
-; VBITS_GE_1024-NEXT:    stp w9, w12, [sp]
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; VBITS_GE_1024-NEXT:    asr w9, w8, #31
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #30, #1
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #29, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #28, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #120]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #112]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #24, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #104]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #23, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #22, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #96]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #21, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #20, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #88]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #19, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #18, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #80]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #17, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #16, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #72]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #64]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #12, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #56]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #11, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #10, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #48]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #9, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #8, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #40]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #32]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_1024-NEXT:    sbfx w12, w8, #4, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #24]
+; VBITS_GE_1024-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_1024-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_1024-NEXT:    stp w12, w11, [sp, #16]
+; VBITS_GE_1024-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_1024-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_1024-NEXT:    stp w10, w9, [sp, #8]
+; VBITS_GE_1024-NEXT:    stp w8, w11, [sp]
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT:    and z0.s, z0.s, #0x1
@@ -1985,107 +1974,106 @@ define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, <64 x i1>* %c) #0 {
 ; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    ldr x9, [x2]
-; VBITS_GE_2048-NEXT:    mov x8, sp
+; VBITS_GE_2048-NEXT:    ldr x8, [x2]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
 ; VBITS_GE_2048-NEXT:    ptrue p1.s
-; VBITS_GE_2048-NEXT:    asr x10, x9, #63
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #62, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #61, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #60, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #248]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #59, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #58, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #57, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #56, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #232]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #55, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #54, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #53, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #52, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #216]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #51, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #50, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #49, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #48, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #200]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #47, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #46, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #45, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #44, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #184]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #43, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #42, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #41, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #40, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #168]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #39, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #38, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #37, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #36, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #152]
-; VBITS_GE_2048-NEXT:    sbfx x10, x9, #35, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x9, #34, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx x12, x9, #33, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x9, #32, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #136]
-; VBITS_GE_2048-NEXT:    asr w10, w9, #31
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #30, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #128]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #29, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #28, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #120]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #27, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #26, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #25, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #24, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #104]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #23, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #22, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #21, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #20, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #88]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #19, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #18, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #17, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #16, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #72]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #15, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #14, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #13, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #12, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #56]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #11, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #10, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #9, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #8, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #40]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #7, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #6, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #32]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #5, #1
-; VBITS_GE_2048-NEXT:    sbfx w13, w9, #4, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #24]
-; VBITS_GE_2048-NEXT:    sbfx w10, w9, #3, #1
-; VBITS_GE_2048-NEXT:    sbfx w11, w9, #2, #1
-; VBITS_GE_2048-NEXT:    stp w13, w12, [sp, #16]
-; VBITS_GE_2048-NEXT:    sbfx w12, w9, #1, #1
-; VBITS_GE_2048-NEXT:    sbfx w9, w9, #0, #1
-; VBITS_GE_2048-NEXT:    stp w11, w10, [sp, #8]
-; VBITS_GE_2048-NEXT:    stp w9, w12, [sp]
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; VBITS_GE_2048-NEXT:    asr x9, x8, #63
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #62, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #61, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #60, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #248]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #59, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #58, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #240]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #57, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #56, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #232]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #55, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #54, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #224]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #53, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #52, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #216]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #51, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #50, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #208]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #49, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #48, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #200]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #47, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #46, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #192]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #45, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #44, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #184]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #43, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #42, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #176]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #41, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #40, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #168]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #39, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #38, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #160]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #37, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #36, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #152]
+; VBITS_GE_2048-NEXT:    sbfx x9, x8, #35, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x8, #34, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #144]
+; VBITS_GE_2048-NEXT:    sbfx x11, x8, #33, #1
+; VBITS_GE_2048-NEXT:    sbfx x12, x8, #32, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #136]
+; VBITS_GE_2048-NEXT:    asr w9, w8, #31
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #30, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #128]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #29, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #28, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #120]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #27, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #26, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #112]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #25, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #24, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #104]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #23, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #22, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #96]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #21, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #20, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #88]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #19, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #18, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #80]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #17, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #16, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #72]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #15, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #14, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #64]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #13, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #12, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #56]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #11, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #10, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #48]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #9, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #8, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #40]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #7, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #6, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #32]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #5, #1
+; VBITS_GE_2048-NEXT:    sbfx w12, w8, #4, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #24]
+; VBITS_GE_2048-NEXT:    sbfx w9, w8, #3, #1
+; VBITS_GE_2048-NEXT:    sbfx w10, w8, #2, #1
+; VBITS_GE_2048-NEXT:    stp w12, w11, [sp, #16]
+; VBITS_GE_2048-NEXT:    sbfx w11, w8, #1, #1
+; VBITS_GE_2048-NEXT:    sbfx w8, w8, #0, #1
+; VBITS_GE_2048-NEXT:    stp w10, w9, [sp, #8]
+; VBITS_GE_2048-NEXT:    stp w8, w11, [sp]
+; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    and z0.s, z0.s, #0x1
@@ -2139,20 +2127,19 @@ define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, <4 x i1>* %c) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldrb w9, [x2]
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ldrb w8, [x2]
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    lsr w10, w9, #3
-; CHECK-NEXT:    lsr w11, w9, #2
-; CHECK-NEXT:    sbfx x12, x9, #0, #1
-; CHECK-NEXT:    lsr w9, w9, #1
-; CHECK-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NEXT:    sbfx x11, x11, #0, #1
+; CHECK-NEXT:    lsr w9, w8, #3
+; CHECK-NEXT:    lsr w10, w8, #2
+; CHECK-NEXT:    sbfx x11, x8, #0, #1
+; CHECK-NEXT:    lsr w8, w8, #1
 ; CHECK-NEXT:    sbfx x9, x9, #0, #1
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
-; CHECK-NEXT:    stp x12, x9, [sp]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    sbfx x10, x10, #0, #1
+; CHECK-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-NEXT:    stp x11, x8, [sp]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.d, z0.d, #0x1
@@ -2180,30 +2167,29 @@ define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, <8 x i1>* %c) #0 {
 ; VBITS_GE_512-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_512-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_512-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_512-NEXT:    ldrb w9, [x2]
-; VBITS_GE_512-NEXT:    mov x8, sp
+; VBITS_GE_512-NEXT:    ldrb w8, [x2]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT:    ptrue p1.d
-; VBITS_GE_512-NEXT:    lsr w10, w9, #7
-; VBITS_GE_512-NEXT:    lsr w11, w9, #6
-; VBITS_GE_512-NEXT:    lsr w12, w9, #5
-; VBITS_GE_512-NEXT:    lsr w13, w9, #4
+; VBITS_GE_512-NEXT:    lsr w9, w8, #7
+; VBITS_GE_512-NEXT:    lsr w10, w8, #6
+; VBITS_GE_512-NEXT:    lsr w11, w8, #5
+; VBITS_GE_512-NEXT:    lsr w12, w8, #4
+; VBITS_GE_512-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x10, x10, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x11, x11, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_512-NEXT:    lsr w14, w9, #3
-; VBITS_GE_512-NEXT:    stp x11, x10, [sp, #48]
-; VBITS_GE_512-NEXT:    lsr w10, w9, #2
-; VBITS_GE_512-NEXT:    stp x13, x12, [sp, #32]
-; VBITS_GE_512-NEXT:    sbfx x12, x9, #0, #1
-; VBITS_GE_512-NEXT:    lsr w9, w9, #1
-; VBITS_GE_512-NEXT:    sbfx x11, x14, #0, #1
-; VBITS_GE_512-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_512-NEXT:    lsr w13, w8, #3
+; VBITS_GE_512-NEXT:    stp x10, x9, [sp, #48]
+; VBITS_GE_512-NEXT:    lsr w9, w8, #2
+; VBITS_GE_512-NEXT:    stp x12, x11, [sp, #32]
+; VBITS_GE_512-NEXT:    sbfx x11, x8, #0, #1
+; VBITS_GE_512-NEXT:    lsr w8, w8, #1
+; VBITS_GE_512-NEXT:    sbfx x10, x13, #0, #1
 ; VBITS_GE_512-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_512-NEXT:    stp x10, x11, [sp, #16]
-; VBITS_GE_512-NEXT:    stp x12, x9, [sp]
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; VBITS_GE_512-NEXT:    sbfx x8, x8, #0, #1
+; VBITS_GE_512-NEXT:    stp x9, x10, [sp, #16]
+; VBITS_GE_512-NEXT:    stp x11, x8, [sp]
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    and z0.d, z0.d, #0x1
@@ -2231,50 +2217,49 @@ define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, <16 x i1>* %c) #0 {
 ; VBITS_GE_1024-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_1024-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_1024-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT:    ldrh w9, [x2]
-; VBITS_GE_1024-NEXT:    mov x8, sp
+; VBITS_GE_1024-NEXT:    ldrh w8, [x2]
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
 ; VBITS_GE_1024-NEXT:    ptrue p1.d
-; VBITS_GE_1024-NEXT:    lsr w10, w9, #15
-; VBITS_GE_1024-NEXT:    lsr w11, w9, #14
-; VBITS_GE_1024-NEXT:    lsr w12, w9, #13
-; VBITS_GE_1024-NEXT:    lsr w13, w9, #12
+; VBITS_GE_1024-NEXT:    lsr w9, w8, #15
+; VBITS_GE_1024-NEXT:    lsr w10, w8, #14
+; VBITS_GE_1024-NEXT:    lsr w11, w8, #13
+; VBITS_GE_1024-NEXT:    lsr w12, w8, #12
+; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w14, w9, #11
-; VBITS_GE_1024-NEXT:    lsr w15, w9, #10
-; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #112]
-; VBITS_GE_1024-NEXT:    lsr w10, w9, #9
-; VBITS_GE_1024-NEXT:    stp x13, x12, [sp, #96]
-; VBITS_GE_1024-NEXT:    lsr w13, w9, #8
+; VBITS_GE_1024-NEXT:    lsr w13, w8, #11
+; VBITS_GE_1024-NEXT:    lsr w14, w8, #10
+; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #112]
+; VBITS_GE_1024-NEXT:    lsr w9, w8, #9
+; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #96]
+; VBITS_GE_1024-NEXT:    lsr w12, w8, #8
+; VBITS_GE_1024-NEXT:    sbfx x10, x13, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x11, x14, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x12, x15, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w14, w9, #3
-; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #80]
-; VBITS_GE_1024-NEXT:    lsr w11, w9, #6
-; VBITS_GE_1024-NEXT:    stp x13, x10, [sp, #64]
-; VBITS_GE_1024-NEXT:    lsr w10, w9, #7
-; VBITS_GE_1024-NEXT:    lsr w12, w9, #5
-; VBITS_GE_1024-NEXT:    lsr w13, w9, #4
+; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
+; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
+; VBITS_GE_1024-NEXT:    lsr w13, w8, #3
+; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #80]
+; VBITS_GE_1024-NEXT:    lsr w10, w8, #6
+; VBITS_GE_1024-NEXT:    stp x12, x9, [sp, #64]
+; VBITS_GE_1024-NEXT:    lsr w9, w8, #7
+; VBITS_GE_1024-NEXT:    lsr w11, w8, #5
+; VBITS_GE_1024-NEXT:    lsr w12, w8, #4
+; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
 ; VBITS_GE_1024-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x13, x13, #0, #1
-; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #48]
-; VBITS_GE_1024-NEXT:    lsr w11, w9, #2
-; VBITS_GE_1024-NEXT:    stp x13, x12, [sp, #32]
-; VBITS_GE_1024-NEXT:    sbfx x12, x9, #0, #1
-; VBITS_GE_1024-NEXT:    lsr w9, w9, #1
-; VBITS_GE_1024-NEXT:    sbfx x10, x14, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_1024-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_1024-NEXT:    stp x11, x10, [sp, #16]
-; VBITS_GE_1024-NEXT:    stp x12, x9, [sp]
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #48]
+; VBITS_GE_1024-NEXT:    lsr w10, w8, #2
+; VBITS_GE_1024-NEXT:    stp x12, x11, [sp, #32]
+; VBITS_GE_1024-NEXT:    sbfx x11, x8, #0, #1
+; VBITS_GE_1024-NEXT:    lsr w8, w8, #1
+; VBITS_GE_1024-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_1024-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_1024-NEXT:    sbfx x8, x8, #0, #1
+; VBITS_GE_1024-NEXT:    stp x10, x9, [sp, #16]
+; VBITS_GE_1024-NEXT:    stp x11, x8, [sp]
+; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; VBITS_GE_1024-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT:    and z0.d, z0.d, #0x1
@@ -2302,121 +2287,120 @@ define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, <32 x i1>* %c) #0 {
 ; VBITS_GE_2048-NEXT:    .cfi_def_cfa w29, 16
 ; VBITS_GE_2048-NEXT:    .cfi_offset w30, -8
 ; VBITS_GE_2048-NEXT:    .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT:    ldr w9, [x2]
-; VBITS_GE_2048-NEXT:    mov x8, sp
+; VBITS_GE_2048-NEXT:    ldr w8, [x2]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
 ; VBITS_GE_2048-NEXT:    ptrue p1.d
-; VBITS_GE_2048-NEXT:    ubfx x10, x9, #31, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #30, #2
+; VBITS_GE_2048-NEXT:    ubfx x9, x8, #31, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #30, #2
+; VBITS_GE_2048-NEXT:    // kill: def $w9 killed $w9 killed $x9 def $x9
 ; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #29, #3
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #28, #4
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #29, #3
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #28, #4
+; VBITS_GE_2048-NEXT:    sbfx x9, x9, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #27, #5
+; VBITS_GE_2048-NEXT:    ubfx x14, x8, #26, #6
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #27, #5
-; VBITS_GE_2048-NEXT:    ubfx x15, x9, #26, #6
 ; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    // kill: def $w15 killed $w15 killed $x15 def $x15
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #240]
-; VBITS_GE_2048-NEXT:    sbfx x10, x12, #0, #1
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #240]
+; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x11, x12, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x12, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x13, x14, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #25, #7
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #23, #9
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x12, x10, [sp, #224]
-; VBITS_GE_2048-NEXT:    sbfx x10, x15, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #24, #8
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #208]
-; VBITS_GE_2048-NEXT:    sbfx x10, x11, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #22, #10
-; VBITS_GE_2048-NEXT:    sbfx x13, x14, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #21, #11
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x12, x10, [sp, #192]
-; VBITS_GE_2048-NEXT:    sbfx x10, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #20, #12
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #19, #13
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #25, #7
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #23, #9
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x11, x9, [sp, #224]
+; VBITS_GE_2048-NEXT:    sbfx x9, x14, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #24, #8
 ; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #176]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #208]
+; VBITS_GE_2048-NEXT:    sbfx x9, x10, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #18, #14
-; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #22, #10
+; VBITS_GE_2048-NEXT:    sbfx x12, x13, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #21, #11
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #17, #15
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #160]
-; VBITS_GE_2048-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #16, #16
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #15, #17
+; VBITS_GE_2048-NEXT:    stp x11, x9, [sp, #192]
+; VBITS_GE_2048-NEXT:    sbfx x9, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #20, #12
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #19, #13
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
 ; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x12, [sp, #144]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #176]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #18, #14
 ; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #14, #18
-; VBITS_GE_2048-NEXT:    sbfx x13, x13, #0, #1
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #13, #19
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #128]
-; VBITS_GE_2048-NEXT:    sbfx x10, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #12, #20
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #11, #21
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #17, #15
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #160]
+; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #16, #16
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #15, #17
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #112]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #10, #22
+; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #144]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #14, #18
 ; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #9, #23
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #96]
-; VBITS_GE_2048-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #8, #24
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #7, #25
 ; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #13, #19
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    stp x10, x12, [sp, #80]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #128]
+; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #12, #20
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #11, #21
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #112]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #10, #22
 ; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #6, #26
-; VBITS_GE_2048-NEXT:    sbfx x13, x13, #0, #1
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #5, #27
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #64]
-; VBITS_GE_2048-NEXT:    sbfx x10, x12, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x11, x9, #4, #28
-; VBITS_GE_2048-NEXT:    ubfx x12, x9, #3, #29
-; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #9, #23
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #96]
+; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #8, #24
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #7, #25
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
 ; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
-; VBITS_GE_2048-NEXT:    stp x10, x13, [sp, #48]
-; VBITS_GE_2048-NEXT:    sbfx x10, x14, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
-; VBITS_GE_2048-NEXT:    ubfx x13, x9, #2, #30
-; VBITS_GE_2048-NEXT:    ubfx x14, x9, #1, #31
+; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #80]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #6, #26
 ; VBITS_GE_2048-NEXT:    sbfx x12, x12, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #5, #27
 ; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
-; VBITS_GE_2048-NEXT:    sbfx x9, x9, #0, #1
-; VBITS_GE_2048-NEXT:    // kill: def $w14 killed $w14 killed $x14 def $x14
-; VBITS_GE_2048-NEXT:    stp x11, x10, [sp, #32]
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #64]
+; VBITS_GE_2048-NEXT:    sbfx x9, x11, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x10, x8, #4, #28
+; VBITS_GE_2048-NEXT:    ubfx x11, x8, #3, #29
+; VBITS_GE_2048-NEXT:    // kill: def $w10 killed $w10 killed $x10 def $x10
+; VBITS_GE_2048-NEXT:    // kill: def $w11 killed $w11 killed $x11 def $x11
+; VBITS_GE_2048-NEXT:    stp x9, x12, [sp, #48]
+; VBITS_GE_2048-NEXT:    sbfx x9, x13, #0, #1
+; VBITS_GE_2048-NEXT:    sbfx x10, x10, #0, #1
+; VBITS_GE_2048-NEXT:    ubfx x12, x8, #2, #30
+; VBITS_GE_2048-NEXT:    ubfx x13, x8, #1, #31
+; VBITS_GE_2048-NEXT:    sbfx x11, x11, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w12 killed $w12 killed $x12 def $x12
+; VBITS_GE_2048-NEXT:    sbfx x8, x8, #0, #1
+; VBITS_GE_2048-NEXT:    // kill: def $w13 killed $w13 killed $x13 def $x13
+; VBITS_GE_2048-NEXT:    stp x10, x9, [sp, #32]
+; VBITS_GE_2048-NEXT:    sbfx x9, x12, #0, #1
 ; VBITS_GE_2048-NEXT:    sbfx x10, x13, #0, #1
-; VBITS_GE_2048-NEXT:    sbfx x11, x14, #0, #1
-; VBITS_GE_2048-NEXT:    stp x10, x12, [sp, #16]
-; VBITS_GE_2048-NEXT:    stp x9, x11, [sp]
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; VBITS_GE_2048-NEXT:    stp x9, x11, [sp, #16]
+; VBITS_GE_2048-NEXT:    stp x8, x10, [sp]
+; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; VBITS_GE_2048-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    and z0.d, z0.d, #0x1

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
index 55c45eebb0394..7643da08ff71b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
@@ -939,7 +939,6 @@ define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    mov z2.d, z1.d[1]
@@ -947,7 +946,7 @@ define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 {
 ; CHECK-NEXT:    mov z1.d, z0.d[3]
 ; CHECK-NEXT:    mov z0.d, z0.d[2]
 ; CHECK-NEXT:    stp d0, d1, [sp]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
index 95ecf2582f762..9116a45224e1a 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -506,18 +506,18 @@ define <vscale x 32 x i1> @test_predicate_insert_32xi1(<vscale x 32 x i1> %val,
 ; CHECK-NEXT:    mov x8, #-1
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    st1b { z0.b }, p1, [sp, #1, mul vl]
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    st1b { z0.b }, p1, [sp]
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    st1b { z0.b }, p1, [x10, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x9, x8, lo
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    st1b { z0.b }, p1, [sp]
-; CHECK-NEXT:    strb w0, [x10, x8]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    strb w0, [x9, x8]
 ; CHECK-NEXT:    ld1b { z0.b }, p1/z, [sp]
-; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x10, #1, mul vl]
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    and z0.b, z0.b, #0x1
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
 ; CHECK-NEXT:    cmpne p0.b, p1/z, z0.b, #0

diff  --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 1122fc2c85a8c..68e34dcd2940f 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -213,19 +213,18 @@ define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, <vscale x 16
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    str q1, [sp, #32]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8, #2, mul vl]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x8, #3, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p0/z, [sp]
-; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #3, mul vl]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #2, mul vl]
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
-; CHECK-NEXT:    st1d { z3.d }, p0, [x0]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [sp, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -256,13 +255,12 @@ define void @insert_v2i64_nxv16i64_lo2(<2 x i64>* %psv, <vscale x 16 x i64>* %ou
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    str q0, [sp, #16]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp]
-; CHECK-NEXT:    st1d { z0.d }, p0, [x1, #1, mul vl]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -319,11 +317,10 @@ define <vscale x 8 x half> @insert_nxv8f16_nxv2f16(<vscale x 8 x half> %vec, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.d }, p1, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -446,15 +443,14 @@ define <vscale x 6 x i32>  @insert_nxv6i32_nxv2i32(<vscale x 2 x i32> %sv0, <vsc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    st1w { z2.d }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    st1w { z0.s }, p1, [sp]
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/sve-ldnf1.mir b/llvm/test/CodeGen/AArch64/sve-ldnf1.mir
new file mode 100644
index 0000000000000..f0df5c9e6893a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-ldnf1.mir
@@ -0,0 +1,277 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=prologepilog -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test that prologepilog works for each of the LDNF1 instructions for stack-based objects.
+#
+--- |
+  define void @testcase_positive_offset() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset 7 * readsize
+    ret void
+  }
+  define void @testcase_negative_offset() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset -8 * readsize
+    ret void
+  }
+
+  define void @testcase_positive_offset_out_of_range() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset 8 * readsize
+    ret void
+  }
+  define void @testcase_negative_offset_out_of_range() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset -9 * readsize
+    ret void
+  }
+...
+---
+name:            testcase_positive_offset
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_positive_offset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_IMM renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_H_IMM renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_S_IMM renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_D_IMM renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_H_IMM renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_S_IMM renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_D_IMM renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_IMM renamable $p0, $sp, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_S_IMM renamable $p0, $sp, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_D_IMM renamable $p0, $sp, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_S_IMM renamable $p0, $sp, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_D_IMM renamable $p0, $sp, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_IMM renamable $p0, $sp, 7 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_D_IMM renamable $p0, $sp, 7 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SW_D_IMM renamable $p0, $sp, 7 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: renamable $z0 = LDNF1D_IMM renamable $p0, $sp, 7 :: (load (s64) from %ir.object)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNF1B_IMM renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_H_IMM renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_S_IMM renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_D_IMM renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_H_IMM renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_S_IMM renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_D_IMM renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_IMM renamable $p0, %stack.1.object, 7 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_S_IMM renamable $p0, %stack.1.object, 7 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_D_IMM renamable $p0, %stack.1.object, 7 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_S_IMM renamable $p0, %stack.1.object, 7 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_D_IMM renamable $p0, %stack.1.object, 7 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1W_IMM renamable $p0, %stack.1.object, 7 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1W_D_IMM renamable $p0, %stack.1.object, 7 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1SW_D_IMM renamable $p0, %stack.1.object, 7 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1D_IMM renamable $p0, %stack.1.object, 7 :: (load 8 from %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+
+---
+name:            testcase_negative_offset
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_negative_offset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_IMM renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_H_IMM renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_S_IMM renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_D_IMM renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_H_IMM renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_S_IMM renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_D_IMM renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_IMM renamable $p0, $sp, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_S_IMM renamable $p0, $sp, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_D_IMM renamable $p0, $sp, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_S_IMM renamable $p0, $sp, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_D_IMM renamable $p0, $sp, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_IMM renamable $p0, $sp, -8 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_D_IMM renamable $p0, $sp, -8 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: renamable $z0 = LDNF1SW_D_IMM renamable $p0, $sp, -8 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: renamable $z0 = LDNF1D_IMM renamable $p0, $sp, -8 :: (load (s64) from %ir.object)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNF1B_IMM renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_H_IMM renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_S_IMM renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_D_IMM renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_H_IMM renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_S_IMM renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_D_IMM renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_IMM renamable $p0, %stack.1.object, -8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_S_IMM renamable $p0, %stack.1.object, -8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_D_IMM renamable $p0, %stack.1.object, -8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_S_IMM renamable $p0, %stack.1.object, -8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_D_IMM renamable $p0, %stack.1.object, -8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1W_IMM renamable $p0, %stack.1.object, -8 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1W_D_IMM renamable $p0, %stack.1.object, -8 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1SW_D_IMM renamable $p0, %stack.1.object, -8 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1D_IMM renamable $p0, %stack.1.object, -8 :: (load 8 from %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+
+---
+name:            testcase_positive_offset_out_of_range
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_positive_offset_out_of_range
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_IMM renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 4
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_H_IMM renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 2
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_S_IMM renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_D_IMM renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 4
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_H_IMM renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 2
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_S_IMM renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_D_IMM renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_IMM renamable $p0, killed $x8, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 4
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_S_IMM renamable $p0, killed $x8, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 2
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_D_IMM renamable $p0, killed $x8, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 4
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_S_IMM renamable $p0, killed $x8, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 2
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_D_IMM renamable $p0, killed $x8, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_IMM renamable $p0, killed $x8, 7 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 4
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_D_IMM renamable $p0, killed $x8, 7 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, 4
+    ; CHECK-NEXT: renamable $z0 = LDNF1SW_D_IMM renamable $p0, killed $x8, 7 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNF1B_IMM renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_H_IMM renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_S_IMM renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_D_IMM renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_H_IMM renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_S_IMM renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_D_IMM renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_IMM renamable $p0, %stack.1.object, 8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_S_IMM renamable $p0, %stack.1.object, 8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_D_IMM renamable $p0, %stack.1.object, 8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_S_IMM renamable $p0, %stack.1.object, 8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_D_IMM renamable $p0, %stack.1.object, 8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1W_IMM renamable $p0, %stack.1.object, 8 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1W_D_IMM renamable $p0, %stack.1.object, 8 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1SW_D_IMM renamable $p0, %stack.1.object, 8 :: (load 4 from %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+
+---
+name:            testcase_negative_offset_out_of_range
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_negative_offset_out_of_range
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_IMM renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -4
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_H_IMM renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -2
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_S_IMM renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNF1B_D_IMM renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -4
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_H_IMM renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -2
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_S_IMM renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNF1SB_D_IMM renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_IMM renamable $p0, killed $x8, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -4
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_S_IMM renamable $p0, killed $x8, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -2
+    ; CHECK-NEXT: renamable $z0 = LDNF1H_D_IMM renamable $p0, killed $x8, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -4
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_S_IMM renamable $p0, killed $x8, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -2
+    ; CHECK-NEXT: renamable $z0 = LDNF1SH_D_IMM renamable $p0, killed $x8, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_IMM renamable $p0, killed $x8, -8 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -4
+    ; CHECK-NEXT: renamable $z0 = LDNF1W_D_IMM renamable $p0, killed $x8, -8 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDPL_XXI $sp, -4
+    ; CHECK-NEXT: renamable $z0 = LDNF1SW_D_IMM renamable $p0, killed $x8, -8 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNF1B_IMM renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_H_IMM renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_S_IMM renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1B_D_IMM renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_H_IMM renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_S_IMM renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1SB_D_IMM renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_IMM renamable $p0, %stack.1.object, -9 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_S_IMM renamable $p0, %stack.1.object, -9 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1H_D_IMM renamable $p0, %stack.1.object, -9 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_S_IMM renamable $p0, %stack.1.object, -9 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1SH_D_IMM renamable $p0, %stack.1.object, -9 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNF1W_IMM renamable $p0, %stack.1.object, -9 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1W_D_IMM renamable $p0, %stack.1.object, -9 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNF1SW_D_IMM renamable $p0, %stack.1.object, -9 :: (load 4 from %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...

diff  --git a/llvm/test/CodeGen/AArch64/sve-ldstnt1.mir b/llvm/test/CodeGen/AArch64/sve-ldstnt1.mir
new file mode 100644
index 0000000000000..a5701a6709284
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-ldstnt1.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=prologepilog -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test that prologepilog works for each of the LDNT1/STNT1 instructions for stack-based objects.
+#
+--- |
+  define void @testcase_positive_offset() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset 7 * readsize
+    ret void
+  }
+  define void @testcase_negative_offset() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset -8 * readsize
+    ret void
+  }
+
+  define void @testcase_positive_offset_out_of_range() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset 8 * readsize
+    ret void
+  }
+  define void @testcase_negative_offset_out_of_range() {
+    %dummy = alloca <vscale x 2 x i64>, align 8
+    %object = alloca <vscale x 2 x i64>, align 8
+    ; Reads from %object at offset -9 * readsize
+    ret void
+  }
+...
+---
+name:            testcase_positive_offset
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_positive_offset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: renamable $z0 = LDNT1B_ZRI renamable $p0, $sp, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNT1H_ZRI renamable $p0, $sp, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNT1W_ZRI renamable $p0, $sp, 7 :: (load (s32) from %ir.object, align 8)
+    ; CHECK-NEXT: renamable $z0 = LDNT1D_ZRI renamable $p0, $sp, 7 :: (load (s64) from %ir.object)
+    ; CHECK-NEXT: STNT1B_ZRI renamable $z0, renamable $p0, $sp, 7 :: (store (s8) into %ir.object, align 8)
+    ; CHECK-NEXT: STNT1H_ZRI renamable $z0, renamable $p0, $sp, 7 :: (store (s16) into %ir.object, align 8)
+    ; CHECK-NEXT: STNT1W_ZRI renamable $z0, renamable $p0, $sp, 7 :: (store (s32) into %ir.object, align 8)
+    ; CHECK-NEXT: STNT1D_ZRI renamable $z0, renamable $p0, $sp, 7 :: (store (s64) into %ir.object)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNT1B_ZRI renamable $p0, %stack.1.object, 7 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNT1H_ZRI renamable $p0, %stack.1.object, 7 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNT1W_ZRI renamable $p0, %stack.1.object, 7 :: (load 4 from %ir.object, align 8)
+    renamable $z0 = LDNT1D_ZRI renamable $p0, %stack.1.object, 7 :: (load 8 from %ir.object, align 8)
+    STNT1B_ZRI renamable $z0, renamable $p0, %stack.1.object, 7 :: (store 1 into %ir.object, align 8)
+    STNT1H_ZRI renamable $z0, renamable $p0, %stack.1.object, 7 :: (store 2 into %ir.object, align 8)
+    STNT1W_ZRI renamable $z0, renamable $p0, %stack.1.object, 7 :: (store 4 into %ir.object, align 8)
+    STNT1D_ZRI renamable $z0, renamable $p0, %stack.1.object, 7 :: (store 8 into %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+
+---
+name:            testcase_negative_offset
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_negative_offset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: renamable $z0 = LDNT1B_ZRI renamable $p0, $sp, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: renamable $z0 = LDNT1H_ZRI renamable $p0, $sp, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNT1W_ZRI renamable $p0, $sp, -8 :: (load (s32) from %ir.object)
+    ; CHECK-NEXT: renamable $z0 = LDNT1D_ZRI renamable $p0, $sp, -8 :: (load (s64) from %ir.object)
+    ; CHECK-NEXT: STNT1B_ZRI renamable $z0, renamable $p0, $sp, -8 :: (store (s8) into %ir.object, align 8)
+    ; CHECK-NEXT: STNT1H_ZRI renamable $z0, renamable $p0, $sp, -8 :: (store (s16) into %ir.object, align 8)
+    ; CHECK-NEXT: STNT1W_ZRI renamable $z0, renamable $p0, $sp, -8 :: (store (s32) into %ir.object, align 8)
+    ; CHECK-NEXT: STNT1D_ZRI renamable $z0, renamable $p0, $sp, -8 :: (store (s64) into %ir.object)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNT1B_ZRI renamable $p0, %stack.1.object, -8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNT1H_ZRI renamable $p0, %stack.1.object, -8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNT1W_ZRI renamable $p0, %stack.1.object, -8 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LDNT1D_ZRI renamable $p0, %stack.1.object, -8 :: (load 8 from %ir.object, align 8)
+    STNT1B_ZRI renamable $z0, renamable $p0, %stack.1.object, -8 :: (store 1 into %ir.object, align 8)
+    STNT1H_ZRI renamable $z0, renamable $p0, %stack.1.object, -8 :: (store 2 into %ir.object, align 8)
+    STNT1W_ZRI renamable $z0, renamable $p0, %stack.1.object, -8 :: (store 4 into %ir.object, align 8)
+    STNT1D_ZRI renamable $z0, renamable $p0, %stack.1.object, -8 :: (store 8 into %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+
+---
+name:            testcase_positive_offset_out_of_range
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_positive_offset_out_of_range
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNT1B_ZRI renamable $p0, killed $x8, 7 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNT1H_ZRI renamable $p0, killed $x8, 7 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNT1W_ZRI renamable $p0, killed $x8, 7 :: (load (s32) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: renamable $z0 = LDNT1D_ZRI renamable $p0, killed $x8, 7 :: (load (s64) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: STNT1B_ZRI renamable $z0, renamable $p0, killed $x8, 7 :: (store (s8) into %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: STNT1H_ZRI renamable $z0, renamable $p0, killed $x8, 7 :: (store (s16) into %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: STNT1W_ZRI renamable $z0, renamable $p0, killed $x8, 7 :: (store (s32) into %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 1
+    ; CHECK-NEXT: STNT1D_ZRI renamable $z0, renamable $p0, killed $x8, 7 :: (store (s64) into %ir.object)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNT1B_ZRI renamable $p0, %stack.1.object, 8 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNT1H_ZRI renamable $p0, %stack.1.object, 8 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNT1W_ZRI renamable $p0, %stack.1.object, 8 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LDNT1D_ZRI renamable $p0, %stack.1.object, 8 :: (load 8 from %ir.object, align 8)
+    STNT1B_ZRI renamable $z0, renamable $p0, %stack.1.object, 8 :: (store 1 into %ir.object, align 8)
+    STNT1H_ZRI renamable $z0, renamable $p0, %stack.1.object, 8 :: (store 2 into %ir.object, align 8)
+    STNT1W_ZRI renamable $z0, renamable $p0, %stack.1.object, 8 :: (store 4 into %ir.object, align 8)
+    STNT1D_ZRI renamable $z0, renamable $p0, %stack.1.object, 8 :: (store 8 into %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+
+---
+name:            testcase_negative_offset_out_of_range
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+  - { id: 1, name: object, type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_negative_offset_out_of_range
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -4
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNT1B_ZRI renamable $p0, killed $x8, -8 :: (load (s8) from %ir.object, align 2)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNT1H_ZRI renamable $p0, killed $x8, -8 :: (load (s16) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNT1W_ZRI renamable $p0, killed $x8, -8 :: (load (s32) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: renamable $z0 = LDNT1D_ZRI renamable $p0, killed $x8, -8 :: (load (s64) from %ir.object)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: STNT1B_ZRI renamable $z0, renamable $p0, killed $x8, -8 :: (store (s8) into %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: STNT1H_ZRI renamable $z0, renamable $p0, killed $x8, -8 :: (store (s16) into %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: STNT1W_ZRI renamable $z0, renamable $p0, killed $x8, -8 :: (store (s32) into %ir.object, align 8)
+    ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -1
+    ; CHECK-NEXT: STNT1D_ZRI renamable $z0, renamable $p0, killed $x8, -8 :: (store (s64) into %ir.object)
+    ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 4
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    renamable $z0 = LDNT1B_ZRI renamable $p0, %stack.1.object, -9 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LDNT1H_ZRI renamable $p0, %stack.1.object, -9 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LDNT1W_ZRI renamable $p0, %stack.1.object, -9 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LDNT1D_ZRI renamable $p0, %stack.1.object, -9 :: (load 8 from %ir.object, align 8)
+    STNT1B_ZRI renamable $z0, renamable $p0, %stack.1.object, -9 :: (store 1 into %ir.object, align 8)
+    STNT1H_ZRI renamable $z0, renamable $p0, %stack.1.object, -9 :: (store 2 into %ir.object, align 8)
+    STNT1W_ZRI renamable $z0, renamable $p0, %stack.1.object, -9 :: (store 4 into %ir.object, align 8)
+    STNT1D_ZRI renamable $z0, renamable $p0, %stack.1.object, -9 :: (store 8 into %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...

diff  --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
index 56d2ff25cb15e..accbb533bd8f5 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
@@ -26,14 +26,14 @@ define i8 @split_extract_32i8_idx(<vscale x 32 x i8> %a, i32 %idx) {
 ; CHECK-NEXT:    mov x8, #-1
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtw x9, w0
-; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    st1b { z1.b }, p0, [x10, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x9, x8, lo
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    ldrb w0, [x10, x8]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ldrb w0, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -51,14 +51,14 @@ define i16 @split_extract_16i16_idx(<vscale x 16 x i16> %a, i32 %idx) {
 ; CHECK-NEXT:    mov x8, #-1
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtw x9, w0
-; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    st1h { z1.h }, p0, [x10, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x9, x8, lo
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    ldrh w0, [x10, x8, lsl #1]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ldrh w0, [x9, x8, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -77,13 +77,13 @@ define i32 @split_extract_8i32_idx(<vscale x 8 x i32> %a, i32 %idx) {
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtw x9, w0
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x9, x8, lo
-; CHECK-NEXT:    st1w { z1.s }, p0, [x10, #1, mul vl]
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    ldr w0, [x10, x8, lsl #2]
+; CHECK-NEXT:    ldr w0, [x9, x8, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -98,19 +98,19 @@ define i64 @split_extract_8i64_idx(<vscale x 8 x i64> %a, i32 %idx) {
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:    cnth x8
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sxtw x10, w0
-; CHECK-NEXT:    sub x9, x9, #1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    sxtw x9, w0
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    csel x9, x10, x9, lo
-; CHECK-NEXT:    st1d { z3.d }, p0, [x8, #3, mul vl]
-; CHECK-NEXT:    st1d { z2.d }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-NEXT:    ldr x0, [x9, x8, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -146,14 +146,14 @@ define i16 @split_extract_16i16(<vscale x 16 x i16> %a) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mov w10, #128
+; CHECK-NEXT:    mov w9, #128
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1h { z1.h }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ldrh w0, [x9, x8, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -170,17 +170,17 @@ define i32 @split_extract_16i32(<vscale x 16 x i32> %a) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w10, #34464
-; CHECK-NEXT:    movk w10, #1, lsl #16
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #34464
+; CHECK-NEXT:    movk w9, #1, lsl #16
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    st1w { z3.s }, p0, [x9, #3, mul vl]
-; CHECK-NEXT:    csel x8, x8, x10, lo
-; CHECK-NEXT:    st1w { z2.s }, p0, [x9, #2, mul vl]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ldr w0, [x9, x8, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -197,13 +197,13 @@ define i64 @split_extract_4i64(<vscale x 4 x i64> %a) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #10
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    mov w10, #10
-; CHECK-NEXT:    cmp x8, #10
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    csel x8, x8, x10, lo
-; CHECK-NEXT:    st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    ldr x0, [x9, x8, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #2

diff  --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
index 0b9baa23a11fc..0465da7e7093d 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
@@ -26,14 +26,14 @@ define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt,
 ; CHECK-NEXT:    mov x8, #-1
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT:    addvl x8, x8, #2
-; CHECK-NEXT:    st1b { z1.b }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT:    cmp x1, x8
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x1, x8, lo
 ; CHECK-NEXT:    strb w0, [x9, x8]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x9, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -53,12 +53,12 @@ define <vscale x 8 x float> @split_insert_8f32_idx(<vscale x 8 x float> %a, floa
 ; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x0, x8, lo
-; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    str s2, [x9, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x9, #1, mul vl]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -78,16 +78,16 @@ define <vscale x 8 x i64> @split_insert_8i64_idx(<vscale x 8 x i64> %a, i64 %elt
 ; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmp x1, x8
+; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    csel x8, x1, x8, lo
-; CHECK-NEXT:    st1d { z3.d }, p0, [x9, #3, mul vl]
-; CHECK-NEXT:    st1d { z2.d }, p0, [x9, #2, mul vl]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    str x0, [x9, x8, lsl #3]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x9, #1, mul vl]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x9, #2, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x9, #3, mul vl]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -136,21 +136,21 @@ define <vscale x 32 x i16> @split_insert_32i16(<vscale x 32 x i16> %a, i16 %elt)
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w10, #128
-; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w9, #128
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z3.h }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1h { z2.h }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    st1h { z3.h }, p0, [x9, #3, mul vl]
-; CHECK-NEXT:    csel x8, x8, x10, lo
-; CHECK-NEXT:    st1h { z2.h }, p0, [x9, #2, mul vl]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    strh w0, [x9, x8, lsl #1]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x9, #1, mul vl]
-; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x9, #2, mul vl]
-; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x9, #3, mul vl]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -170,14 +170,14 @@ define <vscale x 8 x i32> @split_insert_8i32(<vscale x 8 x i32> %a, i32 %elt) {
 ; CHECK-NEXT:    movk w9, #15, lsl #16
 ; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1w { z1.s }, p0, [x10, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    str w0, [x10, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, #1, mul vl]
+; CHECK-NEXT:    str w0, [x9, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
