[llvm] [AArch64][GlobalISel] Legalize vector boolean bitcasts to scalars by lowering via stack. (PR #121171)

Amara Emerson via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 5 21:32:32 PST 2025


https://github.com/aemerson updated https://github.com/llvm/llvm-project/pull/121171

From 0be38ccf5c865b4fddc357b33c378c70a20532b9 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Thu, 26 Dec 2024 16:13:55 -0800
Subject: [PATCH 1/4] [𝘀𝗽𝗿] changes to main this commit is based on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5

[skip ci]
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 14 ++++++--
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  1 +
 .../legalize-store-vector-bools.mir           | 32 +++++++++++++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e2247f76098e97..a931123638ffb9 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3022,8 +3022,18 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
       return UnableToLegalize;
 
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-    if (!Ty.isScalar())
-      return UnableToLegalize;
+    if (!Ty.isScalar()) {
+      // We need to widen the vector element type.
+      Observer.changingInstr(MI);
+      widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT);
+      // We also need to adjust the MMO to turn this into a truncating store.
+      MachineMemOperand &MMO = **MI.memoperands_begin();
+      MachineFunction &MF = MIRBuilder.getMF();
+      auto *NewMMO = MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), Ty);
+      MI.setMemRefs(MF, {NewMMO});
+      Observer.changedInstr(MI);
+      return Legalized;
+    }
 
     Observer.changingInstr(MI);
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 4b7d4158faf069..2c35482b7c9e5f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -454,6 +454,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           {nxv2s64, p0, nxv2s64, 8},
       })
       .clampScalar(0, s8, s64)
+      .minScalarOrElt(0, s8)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
new file mode 100644
index 00000000000000..de70f89461780b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
@@ -0,0 +1,32 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=2 %s -o - | FileCheck %s
+# This test is currently expected to fall back after reaching a truncstore of <8 x s8> as <8 x s1>.
+---
+name:            store_8xs1
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $q0, $q1, $x0
+    ; CHECK-LABEL: name: store_8xs1
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(slt), [[CONCAT_VECTORS]](<8 x s32>), [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s8>) = G_ANYEXT [[ICMP]](<8 x s1>)
+    ; CHECK-NEXT: G_STORE [[ANYEXT]](<8 x s8>), %ptr(p0) :: (store (<8 x s1>))
+    ; CHECK-NEXT: RET_ReallyLR
+    %1:_(<4 x s32>) = COPY $q0
+    %2:_(<4 x s32>) = COPY $q1
+    %ptr:_(p0) = COPY $x0
+    %0:_(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %2(<4 x s32>)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<8 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32)
+    %5:_(<8 x s1>) = G_ICMP intpred(slt), %0(<8 x s32>), %3
+    G_STORE %5(<8 x s1>), %ptr(p0) :: (store (<8 x s1>))
+    RET_ReallyLR
+...

From 18da0bff65252d4ef62f7dcefa73b7b508d10bec Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Fri, 27 Dec 2024 10:49:17 -0800
Subject: [PATCH 2/4] [𝘀𝗽𝗿] changes introduced through rebase
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5

[skip ci]
---
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a931123638ffb9..7af074a37e13c3 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3022,6 +3022,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
       return UnableToLegalize;
 
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+    assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
     if (!Ty.isScalar()) {
       // We need to widen the vector element type.
       Observer.changingInstr(MI);

From b9214baba592d4c7860d714b6d0dffd519a48400 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Fri, 27 Dec 2024 17:34:25 -0800
Subject: [PATCH 3/4] Factor out into function.

Created using spr 1.3.5
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   3 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  47 +-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   3 +-
 .../AArch64/GlobalISel/legalize-bitcast.mir   |  59 +-
 .../legalize-store-vector-bools.mir           |  68 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll | 605 ++++++++++++++----
 6 files changed, 640 insertions(+), 145 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index fac059803b9489..4e18f5cc913a7e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -302,6 +302,9 @@ class LegalizerHelper {
   /// same type as \p Res.
   MachineInstrBuilder createStackStoreLoad(const DstOp &Res, const SrcOp &Val);
 
+  /// Given a store of a boolean vector, scalarize it.
+  LegalizeResult scalarizeVectorBooleanStore(GStore &MI);
+
   /// Get a pointer to vector element \p Index located in memory for a vector of
   /// type \p VecTy starting at a base address of \p VecPtr. If \p Index is out
   /// of bounds the returned pointer is unspecified, but will be within the
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 7dece931e8e0eb..0bfa897ecf4047 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4143,9 +4143,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   }
 
   if (MemTy.isVector()) {
-    // TODO: Handle vector trunc stores
     if (MemTy != SrcTy)
-      return UnableToLegalize;
+      return scalarizeVectorBooleanStore(StoreMI);
 
     // TODO: We can do better than scalarizing the vector and at least split it
     // in half.
@@ -4200,6 +4199,50 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   return Legalized;
 }
 
+LegalizerHelper::LegalizeResult
+LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
+  Register SrcReg = StoreMI.getValueReg();
+  Register PtrReg = StoreMI.getPointerReg();
+  LLT SrcTy = MRI.getType(SrcReg);
+  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
+  LLT MemTy = MMO.getMemoryType();
+  LLT MemScalarTy = MemTy.getElementType();
+  MachineFunction &MF = MIRBuilder.getMF();
+
+  assert(SrcTy.isVector() && "Expect a vector store type");
+
+  if (!MemScalarTy.isByteSized()) {
+    // We need to build an integer scalar of the vector bit pattern.
+    // It's not legal for us to add padding when storing a vector.
+    unsigned NumBits = MemTy.getSizeInBits();
+    LLT IntTy = LLT::scalar(NumBits);
+    auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
+    LLT IdxTy = getLLTForMVT(TLI.getVectorIdxTy(MF.getDataLayout()));
+
+    for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
+      auto Elt = MIRBuilder.buildExtractVectorElement(
+          SrcTy.getElementType(), SrcReg, MIRBuilder.buildConstant(IdxTy, I));
+      auto Trunc = MIRBuilder.buildTrunc(MemScalarTy, Elt);
+      auto ZExt = MIRBuilder.buildZExt(IntTy, Trunc);
+      unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
+                                  ? (MemTy.getNumElements() - 1) - I
+                                  : I;
+      auto ShiftAmt = MIRBuilder.buildConstant(
+          IntTy, ShiftIntoIdx * MemScalarTy.getSizeInBits());
+      auto Shifted = MIRBuilder.buildShl(IntTy, ZExt, ShiftAmt);
+      CurrVal = MIRBuilder.buildOr(IntTy, CurrVal, Shifted);
+    }
+    auto PtrInfo = MMO.getPointerInfo();
+    auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, IntTy);
+    MIRBuilder.buildStore(CurrVal, PtrReg, *NewMMO);
+    StoreMI.eraseFromParent();
+    return Legalized;
+  }
+
+  // TODO: implement simple scalarization.
+  return UnableToLegalize;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   switch (MI.getOpcode()) {
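
To follow the new lowering: scalarizeVectorBooleanStore packs each s1
element into a single scalar by truncating, zero-extending, shifting the
bit into its lane position (reversed on big-endian targets), and OR-ing
it into an accumulator, then emits one scalar store of the packed value.
A minimal standalone C++ sketch of the same packing on a plain bool
array (illustrative only, not an LLVM API):

    #include <cstddef>
    #include <cstdint>

    // Mirrors the G_EXTRACT_VECTOR_ELT / G_AND / G_SHL / G_OR chain the
    // legalizer emits for an <N x s1> store; each lane lands at bit
    // ShiftIntoIdx * 1 because the memory element type is one bit wide.
    uint64_t packBoolVector(const bool *Lanes, size_t N, bool BigEndian) {
      uint64_t CurrVal = 0; // the initial G_CONSTANT 0 accumulator
      for (size_t I = 0; I != N; ++I) {
        uint64_t Bit = Lanes[I] ? 1 : 0; // trunc to s1, then zext
        size_t ShiftIntoIdx = BigEndian ? (N - 1) - I : I;
        CurrVal |= Bit << ShiftIntoIdx; // G_SHL + G_OR
      }
      return CurrVal; // stored with a single scalar G_STORE
    }

For example, lanes {1,0,1,1,0,0,0,0} pack to 0b00001101 on a
little-endian target, which is exactly what the shift/or chains in the
updated MIR tests below compute.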
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 2fac100f81519a..641f06530a5c23 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -474,7 +474,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                  })
       .customIf(IsPtrVecPred)
       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
-      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+      .lower();
 
   getActionDefinitionsBuilder(G_INDEXED_STORE)
       // Idx 0 == Ptr, Idx 1 == Val
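
One subtlety in the rule change above: LegalizeRuleSet clauses are tried
in declaration order and the first match wins, so the appended .lower()
only fires for stores no earlier clause claims. That is how <N x s1>
stores now reach lowerStore() and the new scalarizeVectorBooleanStore
path instead of returning UnableToLegalize. A first-match-wins toy model
in C++ (the predicates are invented for illustration, not the real
AArch64 rules):

    #include <functional>
    #include <utility>
    #include <vector>

    enum class Action { Legal, Scalarize, Lower };
    struct StoreQuery { unsigned EltBits; bool IsVector; };
    using Rule = std::pair<std::function<bool(const StoreQuery &)>, Action>;

    // Each entry models one clause of the builder chain; the trailing
    // always-true entry plays the role of the new .lower() catch-all.
    Action classify(const StoreQuery &Q) {
      const std::vector<Rule> Rules = {
          {[](const StoreQuery &S) { return !S.IsVector && S.EltBits >= 8; },
           Action::Legal},
          {[](const StoreQuery &S) { return S.EltBits > 64; },
           Action::Scalarize},
          {[](const StoreQuery &) { return true; }, Action::Lower},
      };
      for (const Rule &R : Rules) // first matching clause wins
        if (R.first(Q))
          return R.second;
      return Action::Lower;
    }
    // classify({1, true}) == Action::Lower: an <8 x s1> store falls
    // through to LegalizerHelper::lowerStore().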
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
index 7b24bb1227fa26..4d461c971d3320 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=2  %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1  %s -o - | FileCheck %s
 ---
 name:            scalar_to_oversize_vector
 tracksRegLiveness: true
@@ -48,17 +48,66 @@ body:             |
     G_BR %bb.2
 
 ...
-# This test is currently expected to fall back after reaching a truncstore of <8 x s8> as <8 x s1>.
 ---
 name:            boolean_vector_to_scalar
 tracksRegLiveness: true
 body:             |
     bb.1:
     ; CHECK-LABEL: name: boolean_vector_to_scalar
-    ; CHECK: %vec:_(<8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s8>) = G_IMPLICIT_DEF
     ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s8>) = G_ANYEXT %vec(<8 x s1>)
-    ; CHECK-NEXT: G_STORE [[ANYEXT]](<8 x s8>), [[FRAME_INDEX]](p0) :: (store (<8 x s1>) into %stack.0)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C1]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C1]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C1]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C1]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C1]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C1]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s8), [[FRAME_INDEX]](p0) :: (store (s8) into %stack.0)
     ; CHECK-NEXT: %bc:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s8) from %stack.0)
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %bc(s8)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
index de70f89461780b..1df6297e363833 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=2 %s -o - | FileCheck %s
-# This test is currently expected to fall back after reaching a truncstore of <8 x s8> as <8 x s1>.
+# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
 ---
 name:            store_8xs1
 tracksRegLiveness: true
@@ -13,12 +12,67 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(slt), [[CONCAT_VECTORS]](<8 x s32>), [[BUILD_VECTOR]]
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s8>) = G_ANYEXT [[ICMP]](<8 x s1>)
-    ; CHECK-NEXT: G_STORE [[ANYEXT]](<8 x s8>), %ptr(p0) :: (store (<8 x s1>))
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY]](<4 x s32>), [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY1]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C1]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C2]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C2]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C2]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C2]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC3]](s8), %ptr(p0) :: (store (s8))
     ; CHECK-NEXT: RET_ReallyLR
     %1:_(<4 x s32>) = COPY $q0
     %2:_(<4 x s32>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 7f2eefe5ed72f6..1fa96979f45530 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -1,26 +1,97 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; Basic tests from input vector to bitmask
 ; IR generated from clang for:
 ; __builtin_convertvector + reinterpret_cast<uint16&>
 
+; GISEL: warning: Instruction selection used fallback path for convert_to_bitmask4
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask2
+; GISEL-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_no_compare
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_compare_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_trunc_in_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_unknown_type_in_long_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_different_types_in_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_4xi8
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_float
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_legalized_illegal_element_size
+; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat
+; GISEL-NEXT: warning: Instruction selection used fallback path for no_combine_illegal_num_elements
+
 define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 ; Bits used in mask
-; CHECK-LABEL: convert_to_bitmask16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
-; CHECK-NEXT:    cmeq.16b v0, v0, #0
-; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+; SDAG-LABEL: convert_to_bitmask16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI0_0@PAGE
+; SDAG-NEXT:    cmeq.16b v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.16b v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 ; Actual conversion
 
@@ -30,19 +101,50 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 }
 
 define i16 @convert_to_bitmask8(<8 x i16> %vec) {
-; CHECK-LABEL: convert_to_bitmask8:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x8, lCPI1_0@PAGE
-; CHECK-NEXT:    cmeq.8h v0, v0, #0
-; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
+; SDAG-LABEL: convert_to_bitmask8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI1_0@PAGE
+; SDAG-NEXT:    cmeq.8h v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
@@ -54,16 +156,13 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) {
 define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh4:
 ; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh5:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -74,17 +173,14 @@ define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 ; CHECK-LABEL: convert_to_bitmask2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh6:
 ; CHECK-NEXT:    adrp x8, lCPI3_0@PAGE
 ; CHECK-NEXT:    cmeq.2d v0, v0, #0
-; CHECK-NEXT:  Lloh7:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI3_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addp.2d d0, v0
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x3
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
 
 
   %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
@@ -97,16 +193,13 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: clang_builtins_undef_concat_convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh8:
 ; CHECK-NEXT:    adrp x8, lCPI4_0@PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh9:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI4_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh9
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -120,9 +213,7 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 ; CHECK-LABEL: convert_to_bitmask_no_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x8, lCPI5_0@PAGE
-; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI5_0@PAGEOFF]
 ; CHECK-NEXT:    shl.4s v0, v0, #31
 ; CHECK-NEXT:    cmlt.4s v0, v0, #0
@@ -130,7 +221,6 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
 
 
   %cmp = and <4 x i32> %vec1, %vec2
@@ -144,16 +234,13 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v2, v0, #0
 ; CHECK-NEXT:    cmeq.4s v0, v0, v1
-; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    adrp x8, lCPI6_0@PAGE
-; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI6_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v0, v2
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -167,10 +254,8 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 ; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh14:
 ; CHECK-NEXT:    adrp x8, lCPI7_0@PAGE
 ; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:  Lloh15:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI7_0@PAGEOFF]
 ; CHECK-NEXT:    shl.4s v0, v0, #31
 ; CHECK-NEXT:    cmlt.4s v0, v0, #0
@@ -178,7 +263,6 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh15
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -193,7 +277,6 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
 ; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:  Lloh16:
 ; CHECK-NEXT:    adrp x8, lCPI8_0@PAGE
 ; CHECK-NEXT:    movi d2, #0x000000ffffffff
 ; CHECK-NEXT:    movi d3, #0x00ffffffffffff
@@ -207,7 +290,6 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK-NEXT:    mov.h v1[2], wzr
 ; CHECK-NEXT:    orr.8b v0, v0, v3
 ; CHECK-NEXT:    orr.8b v0, v1, v0
-; CHECK-NEXT:  Lloh17:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI8_0@PAGEOFF]
 ; CHECK-NEXT:    shl.4h v0, v0, #15
 ; CHECK-NEXT:    cmlt.4h v0, v0, #0
@@ -215,7 +297,6 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh17
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -238,17 +319,14 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v1, v1, #0
 ; CHECK-NEXT:    cmeq.4h v0, v0, #0
-; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    adrp x8, lCPI9_0@PAGE
 ; CHECK-NEXT:    xtn.4h v1, v1
 ; CHECK-NEXT:    orn.8b v0, v1, v0
-; CHECK-NEXT:  Lloh19:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI9_0@PAGEOFF]
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh19
 
 
   %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer
@@ -259,21 +337,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4
 }
 
 define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
-; CHECK-LABEL: convert_to_bitmask_without_knowing_type:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    adrp x8, lCPI10_0@PAGE
-; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh21
+; SDAG-LABEL: convert_to_bitmask_without_knowing_type:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI10_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_without_knowing_type:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %bitmask = bitcast <16 x i1> %vec to i16
   ret i16 %bitmask
@@ -282,16 +412,13 @@ define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
 define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_2xi32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    adrp x8, lCPI11_0@PAGE
 ; CHECK-NEXT:    cmeq.2s v0, v0, #0
-; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI11_0@PAGEOFF]
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addp.2s v0, v0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh23
 
   %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
   %bitmask = bitcast <2 x i1> %cmp_result to i2
@@ -302,16 +429,13 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_4xi8:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:  Lloh24:
 ; CHECK-NEXT:    adrp x8, lCPI12_0@PAGE
-; CHECK-NEXT:  Lloh25:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI12_0@PAGEOFF]
 ; CHECK-NEXT:    cmeq.4h v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh25
 
   %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -322,17 +446,14 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_8xi2:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.8b v1, #3
-; CHECK-NEXT:  Lloh26:
 ; CHECK-NEXT:    adrp x8, lCPI13_0@PAGE
 ; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:  Lloh27:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI13_0@PAGEOFF]
 ; CHECK-NEXT:    cmeq.8b v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh27
 
   %cmp_result = icmp ne <8 x i2> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -344,16 +465,13 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    fcmgt.4s v1, v0, #0.0
 ; CHECK-NEXT:    fcmlt.4s v0, v0, #0.0
-; CHECK-NEXT:  Lloh28:
 ; CHECK-NEXT:    adrp x8, lCPI14_0@PAGE
 ; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:  Lloh29:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI14_0@PAGEOFF]
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh28, Lloh29
 
 
   %cmp_result = fcmp one <4 x float> %vec, zeroinitializer
@@ -364,24 +482,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; Larger vector types don't map directly, but they can be split/truncated and then converted.
 ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
 define i8 @convert_large_vector(<8 x i32> %vec) {
-; CHECK-LABEL: convert_large_vector:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh30:
-; CHECK-NEXT:    adrp x8, lCPI15_0@PAGE
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-NEXT:  Lloh31:
-; CHECK-NEXT:    ldr q1, [x8, lCPI15_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh31
+; SDAG-LABEL: convert_large_vector:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI15_0@PAGE
+; SDAG-NEXT:    uzp1.8h v0, v0, v1
+; SDAG-NEXT:    ldr q1, [x8, lCPI15_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_large_vector:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mvn.16b v1, v1
+; GISEL-NEXT:    uzp1.8h v0, v0, v1
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
    %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
@@ -393,17 +545,14 @@ define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
 ; CHECK-LABEL: convert_legalized_illegal_element_size:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.4s v1, #63, msl #16
-; CHECK-NEXT:  Lloh32:
 ; CHECK-NEXT:    adrp x8, lCPI16_0@PAGE
 ; CHECK-NEXT:    cmtst.4s v0, v0, v1
-; CHECK-NEXT:  Lloh33:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI16_0@PAGEOFF]
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh33
 
   %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -415,7 +564,6 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-LABEL: no_direct_convert_for_bad_concat:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmtst.4s v0, v0, v0
-; CHECK-NEXT:  Lloh34:
 ; CHECK-NEXT:    adrp x8, lCPI17_0@PAGE
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    umov.h w9, v0[0]
@@ -427,14 +575,12 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-NEXT:    umov.h w9, v0[3]
 ; CHECK-NEXT:    mov.b v1[7], w9
 ; CHECK-NEXT:    shl.8b v0, v1, #7
-; CHECK-NEXT:  Lloh35:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI17_0@PAGEOFF]
 ; CHECK-NEXT:    cmlt.8b v0, v0, #0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh35
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %vector_pad = shufflevector <4 x i1> poison, <4 x i1> %cmp_result, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
@@ -443,11 +589,18 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 }
 
 define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
-; CHECK-LABEL: no_convert_without_direct_bitcast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmtst.8h v0, v0, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: no_convert_without_direct_bitcast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmtst.8h v0, v0, v0
+; SDAG-NEXT:    xtn.8b v0, v0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: no_convert_without_direct_bitcast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    ret
 
    %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
    ret <8 x i1> %cmp_result
@@ -492,28 +645,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 
 ; Only apply the combine when casting a vector to a scalar.
 define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
-; CHECK-LABEL: vector_to_vector_cast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh36:
-; CHECK-NEXT:    adrp x8, lCPI20_0@PAGE
-; CHECK-NEXT:  Lloh37:
-; CHECK-NEXT:    ldr q1, [x8, lCPI20_0@PAGEOFF]
-; CHECK-NEXT:    add x8, sp, #14
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    str h0, [sp, #14]
-; CHECK-NEXT:    ld1.b { v0 }[0], [x8]
-; CHECK-NEXT:    orr x8, x8, #0x1
-; CHECK-NEXT:    ld1.b { v0 }[4], [x8]
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh37
+; SDAG-LABEL: vector_to_vector_cast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI20_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI20_0@PAGEOFF]
+; SDAG-NEXT:    add x8, sp, #14
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    str h0, [sp, #14]
+; SDAG-NEXT:    ld1.b { v0 }[0], [x8]
+; SDAG-NEXT:    orr x8, x8, #0x1
+; SDAG-NEXT:    ld1.b { v0 }[4], [x8]
+; SDAG-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: vector_to_vector_cast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    mov d1, v0[1]
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    umov.b w14, v0[2]
+; GISEL-NEXT:    umov.b w15, v0[3]
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    umov.b w17, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w0, v1[1]
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    bfi w13, w10, #1, #31
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w8, v1[0]
+; GISEL-NEXT:    umov.b w10, v1[2]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w13, w13, w14, lsl #2
+; GISEL-NEXT:    umov.b w14, v1[3]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #2
+; GISEL-NEXT:    orr w13, w13, w15, lsl #3
+; GISEL-NEXT:    umov.b w15, v1[4]
+; GISEL-NEXT:    umov.b w11, v0[6]
+; GISEL-NEXT:    bfi w8, w0, #1, #31
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w17, #0x1
+; GISEL-NEXT:    orr w13, w13, w16, lsl #4
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #2
+; GISEL-NEXT:    umov.b w10, v1[5]
+; GISEL-NEXT:    umov.b w16, v1[6]
+; GISEL-NEXT:    orr w13, w13, w17, lsl #5
+; GISEL-NEXT:    umov.b w17, v0[4]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w8, w8, w14, lsl #3
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v1[7]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #3
+; GISEL-NEXT:    orr w11, w13, w11, lsl #6
+; GISEL-NEXT:    orr w8, w8, w15, lsl #4
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w12, w17, #0x1
+; GISEL-NEXT:    umov.b w13, v0[1]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #5
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w12, lsl #4
+; GISEL-NEXT:    umov.b w10, v0[0]
+; GISEL-NEXT:    orr w11, w11, w0, lsl #7
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w16, lsl #6
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    umov.b w12, v0[6]
+; GISEL-NEXT:    strb w11, [sp, #8]
+; GISEL-NEXT:    and w11, w13, #0x1
+; GISEL-NEXT:    umov.b w13, v0[3]
+; GISEL-NEXT:    orr w8, w8, w14, lsl #7
+; GISEL-NEXT:    umov.b w14, v0[7]
+; GISEL-NEXT:    ldr b0, [sp, #8]
+; GISEL-NEXT:    bfi w10, w11, #1, #31
+; GISEL-NEXT:    and w11, w15, #0x1
+; GISEL-NEXT:    strb w8, [sp, #9]
+; GISEL-NEXT:    umov.b w15, v0[4]
+; GISEL-NEXT:    and w8, w12, #0x1
+; GISEL-NEXT:    orr w10, w10, w11, lsl #2
+; GISEL-NEXT:    orr w8, w9, w8, lsl #6
+; GISEL-NEXT:    and w9, w13, #0x1
+; GISEL-NEXT:    umov.b w11, v0[1]
+; GISEL-NEXT:    orr w9, w10, w9, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[0]
+; GISEL-NEXT:    and w13, w14, #0x1
+; GISEL-NEXT:    umov.b w16, v0[2]
+; GISEL-NEXT:    umov.b w17, v0[3]
+; GISEL-NEXT:    and w14, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w13, lsl #7
+; GISEL-NEXT:    orr w9, w9, w14, lsl #4
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v0[3]
+; GISEL-NEXT:    strb w8, [sp, #10]
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    bfi w12, w11, #1, #31
+; GISEL-NEXT:    orr w8, w9, w8, lsl #5
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    and w9, w15, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    orr w9, w12, w9, lsl #2
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w8, w8, w13, lsl #6
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    orr w9, w9, w14, lsl #3
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    umov.b w0, v0[3]
+; GISEL-NEXT:    orr w9, w9, w10, lsl #4
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    umov.b w12, v0[7]
+; GISEL-NEXT:    orr w8, w8, w11, lsl #7
+; GISEL-NEXT:    bfi w13, w15, #1, #31
+; GISEL-NEXT:    and w11, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w10, lsl #5
+; GISEL-NEXT:    and w10, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[4]
+; GISEL-NEXT:    strb w8, [sp, #11]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    umov.b w16, v0[3]
+; GISEL-NEXT:    orr w8, w9, w10, lsl #6
+; GISEL-NEXT:    orr w9, w13, w11, lsl #2
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    and w11, w17, #0x1
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    umov.b w17, v0[0]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #7
+; GISEL-NEXT:    orr w9, w9, w11, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    and w11, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[0]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #4
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    bfi w17, w15, #1, #31
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w12, v0[2]
+; GISEL-NEXT:    bfi w14, w10, #1, #31
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    ldr b1, [sp, #9]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    strb w8, [sp, #12]
+; GISEL-NEXT:    orr w11, w14, w11, lsl #2
+; GISEL-NEXT:    and w14, w16, #0x1
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w13, lsl #6
+; GISEL-NEXT:    orr w11, w11, w14, lsl #3
+; GISEL-NEXT:    orr w12, w17, w12, lsl #2
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[5]
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    orr w10, w11, w10, lsl #4
+; GISEL-NEXT:    orr w12, w12, w17, lsl #3
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    umov.b w17, v0[6]
+; GISEL-NEXT:    orr w10, w10, w15, lsl #5
+; GISEL-NEXT:    umov.b w15, v0[7]
+; GISEL-NEXT:    orr w12, w12, w16, lsl #4
+; GISEL-NEXT:    and w16, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w12, w12, w16, lsl #5
+; GISEL-NEXT:    orr w10, w10, w14, lsl #6
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w17, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #7
+; GISEL-NEXT:    mov.s v0[1], v1[0]
+; GISEL-NEXT:    orr w11, w12, w13, lsl #6
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; GISEL-NEXT:    orr w8, w10, w12, lsl #7
+; GISEL-NEXT:    and w10, w0, #0x1
+; GISEL-NEXT:    strb w9, [sp, #13]
+; GISEL-NEXT:    orr w9, w11, w10, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #14]
+; GISEL-NEXT:    strb w9, [sp, #15]
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
   %bc = bitcast <16 x i1> %arg to <2 x i8>
   ret <2 x i8> %bc
 }
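
For context on the vec-combine-compare-to-bitmask.ll updates above: the
file's header comment says the IR comes from clang's
__builtin_convertvector plus a reinterpret_cast. A hedged reconstruction
of that source pattern (type names and attribute spelling are
assumptions, not taken from the test):

    #include <cstdint>

    // Illustrative only: a lane-wise compare, narrowed to a bool vector
    // with __builtin_convertvector, then reinterpreted as a bitmask.
    // In IR this becomes an icmp ne plus a bitcast <16 x i1> to i16,
    // the pattern the tests above exercise through both selectors.
    typedef int8_t v16i8 __attribute__((vector_size(16)));
    typedef bool v16i1 __attribute__((ext_vector_type(16)));

    uint16_t convert_to_bitmask16(v16i8 vec) {
      v16i1 cmp = __builtin_convertvector(vec != 0, v16i1);
      return reinterpret_cast<uint16_t &>(cmp);
    }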

From 366d57a3b43847b25b370a6da0e439e1bf7e7435 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Fri, 27 Dec 2024 20:54:02 -0800
Subject: [PATCH 4/4] Try to fix commit stack

Created using spr 1.3.5
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   3 -
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  47 +-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   3 +-
 .../AArch64/GlobalISel/legalize-bitcast.mir   |  59 +-
 .../legalize-store-vector-bools.mir           |  68 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll | 605 ++++--------------
 6 files changed, 145 insertions(+), 640 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 4e18f5cc913a7e..fac059803b9489 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -302,9 +302,6 @@ class LegalizerHelper {
   /// same type as \p Res.
   MachineInstrBuilder createStackStoreLoad(const DstOp &Res, const SrcOp &Val);
 
-  /// Given a store of a boolean vector, scalarize it.
-  LegalizeResult scalarizeVectorBooleanStore(GStore &MI);
-
   /// Get a pointer to vector element \p Index located in memory for a vector of
   /// type \p VecTy starting at a base address of \p VecPtr. If \p Index is out
   /// of bounds the returned pointer is unspecified, but will be within the
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 0bfa897ecf4047..7dece931e8e0eb 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4143,8 +4143,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   }
 
   if (MemTy.isVector()) {
+    // TODO: Handle vector trunc stores
     if (MemTy != SrcTy)
-      return scalarizeVectorBooleanStore(StoreMI);
+      return UnableToLegalize;
 
     // TODO: We can do better than scalarizing the vector and at least split it
     // in half.
@@ -4199,50 +4200,6 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   return Legalized;
 }
 
-LegalizerHelper::LegalizeResult
-LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
-  Register SrcReg = StoreMI.getValueReg();
-  Register PtrReg = StoreMI.getPointerReg();
-  LLT SrcTy = MRI.getType(SrcReg);
-  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
-  LLT MemTy = MMO.getMemoryType();
-  LLT MemScalarTy = MemTy.getElementType();
-  MachineFunction &MF = MIRBuilder.getMF();
-
-  assert(SrcTy.isVector() && "Expect a vector store type");
-
-  if (!MemScalarTy.isByteSized()) {
-    // We need to build an integer scalar of the vector bit pattern.
-    // It's not legal for us to add padding when storing a vector.
-    unsigned NumBits = MemTy.getSizeInBits();
-    LLT IntTy = LLT::scalar(NumBits);
-    auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
-    LLT IdxTy = getLLTForMVT(TLI.getVectorIdxTy(MF.getDataLayout()));
-
-    for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
-      auto Elt = MIRBuilder.buildExtractVectorElement(
-          SrcTy.getElementType(), SrcReg, MIRBuilder.buildConstant(IdxTy, I));
-      auto Trunc = MIRBuilder.buildTrunc(MemScalarTy, Elt);
-      auto ZExt = MIRBuilder.buildZExt(IntTy, Trunc);
-      unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
-                                  ? (MemTy.getNumElements() - 1) - I
-                                  : I;
-      auto ShiftAmt = MIRBuilder.buildConstant(
-          IntTy, ShiftIntoIdx * MemScalarTy.getSizeInBits());
-      auto Shifted = MIRBuilder.buildShl(IntTy, ZExt, ShiftAmt);
-      CurrVal = MIRBuilder.buildOr(IntTy, CurrVal, Shifted);
-    }
-    auto PtrInfo = MMO.getPointerInfo();
-    auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, IntTy);
-    MIRBuilder.buildStore(CurrVal, PtrReg, *NewMMO);
-    StoreMI.eraseFromParent();
-    return Legalized;
-  }
-
-  // TODO: implement simple scalarization.
-  return UnableToLegalize;
-}
-
 LegalizerHelper::LegalizeResult
 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   switch (MI.getOpcode()) {
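
For context, the scalarizeVectorBooleanStore helper removed above packed a boolean vector into a single integer before storing: each element is truncated to the memory element type, zero-extended into an integer covering the whole vector, shifted into its lane slot, and OR'd into an accumulator, with big-endian targets reversing the lane index. A minimal C++ model of that loop, with illustrative names that are not LLVM API:

#include <cstdint>
#include <vector>

// Models the G_TRUNC/G_ZEXT/G_SHL/G_OR chain the removed helper built
// per element; Elts holds the raw lane values, EltBits is the memory
// element width in bits (1 for <N x s1>).
uint64_t packBoolVector(const std::vector<uint8_t> &Elts, unsigned EltBits,
                        bool IsBigEndian) {
  const unsigned NumElts = Elts.size();
  const uint64_t EltMask = (1ull << EltBits) - 1;
  uint64_t Acc = 0; // the initial G_CONSTANT 0
  for (unsigned I = 0; I != NumElts; ++I) {
    uint64_t Bits = Elts[I] & EltMask;               // G_TRUNC + G_ZEXT
    unsigned Lane = IsBigEndian ? (NumElts - 1) - I : I;
    Acc |= Bits << (Lane * EltBits);                 // G_SHL + G_OR
  }
  return Acc; // stored with one integer-typed G_STORE
}
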
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 641f06530a5c23..2fac100f81519a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -474,8 +474,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                  })
       .customIf(IsPtrVecPred)
       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
-      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
-      .lower();
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
 
   getActionDefinitionsBuilder(G_INDEXED_STORE)
       // Idx 0 == Ptr, Idx 1 == Val
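
With the trailing .lower() dropped from the store rule set, a vector truncating store that no earlier rule claims is now reported as unable to legalize, and under -global-isel-abort=2 the whole function falls back to SelectionDAG, which is what the updated tests check for. A toy C++ model of that rule-chain behavior, using illustrative types rather than the real LegalizeRuleSet machinery:

enum class Action { Scalarize, Lower, Unable };

struct Rule {
  bool (*Pred)(unsigned EltBits);
  Action Act;
};

// Rules fire in order; without a catch-all .lower() entry, an unmatched
// query falls through to Unable, triggering the GlobalISel fallback.
Action legalizeStore(unsigned EltBits) {
  static const Rule Rules[] = {
      {[](unsigned Bits) { return Bits > 64; }, Action::Scalarize},
  };
  for (const Rule &R : Rules)
    if (R.Pred(EltBits))
      return R.Act;
  return Action::Unable;
}
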
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
index 4d461c971d3320..7b24bb1227fa26 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1  %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=2  %s -o - | FileCheck %s
 ---
 name:            scalar_to_oversize_vector
 tracksRegLiveness: true
@@ -48,66 +48,17 @@ body:             |
     G_BR %bb.2
 
 ...
+# This test is currently expected to fall back after reaching a truncstore of <8 x s8> as <8 x s1>.
 ---
 name:            boolean_vector_to_scalar
 tracksRegLiveness: true
 body:             |
     bb.1:
     ; CHECK-LABEL: name: boolean_vector_to_scalar
-    ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK: %vec:_(<8 x s1>) = G_IMPLICIT_DEF
     ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C]](s64)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]]
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s64)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]]
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C3]](s64)
-    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
-    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
-    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C4]](s64)
-    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C1]]
-    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
-    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
-    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
-    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C5]](s64)
-    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
-    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C1]]
-    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
-    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
-    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C6]](s64)
-    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
-    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C1]]
-    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
-    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
-    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
-    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C7]](s64)
-    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
-    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C1]]
-    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
-    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
-    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C8]](s64)
-    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
-    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C1]]
-    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
-    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
-    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
-    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C9]](s64)
-    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
-    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C1]]
-    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
-    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
-    ; CHECK-NEXT: G_STORE [[TRUNC]](s8), [[FRAME_INDEX]](p0) :: (store (s8) into %stack.0)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s8>) = G_ANYEXT %vec(<8 x s1>)
+    ; CHECK-NEXT: G_STORE [[ANYEXT]](<8 x s8>), [[FRAME_INDEX]](p0) :: (store (<8 x s1>) into %stack.0)
     ; CHECK-NEXT: %bc:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s8) from %stack.0)
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %bc(s8)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)
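
The new CHECK lines keep the store's memory type at <8 x s1> while widening the value operand to <8 x s8>, so what remains is a vector truncating store: only bit 0 of each anyext'd byte lane is meaningful, and the access writes a single packed byte. A sketch of that memory effect on a little-endian target, with illustrative names:

#include <cstdint>

void truncStore8xS1(uint8_t *Ptr, const uint8_t Lanes[8]) {
  uint8_t Byte = 0;
  for (unsigned I = 0; I != 8; ++I)
    Byte |= (Lanes[I] & 1u) << I; // keep bit 0 of each widened lane
  *Ptr = Byte;                    // one s8-sized access
}
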
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
index 1df6297e363833..de70f89461780b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
+# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=2 %s -o - | FileCheck %s
+# This test is currently expected to fall back after reaching a truncstore of <8 x s8> as <8 x s1>.
 ---
 name:            store_8xs1
 tracksRegLiveness: true
@@ -12,67 +13,12 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY]](<4 x s32>), [[BUILD_VECTOR]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY1]](<4 x s32>), [[BUILD_VECTOR1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C1]](s64)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s64)
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C3]](s64)
-    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
-    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
-    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C4]](s64)
-    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
-    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
-    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
-    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
-    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C5]](s64)
-    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
-    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
-    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
-    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
-    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C6]](s64)
-    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
-    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C2]]
-    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
-    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
-    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
-    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C7]](s64)
-    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
-    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C2]]
-    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
-    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
-    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C8]](s64)
-    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
-    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C2]]
-    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
-    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
-    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
-    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C9]](s64)
-    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
-    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C2]]
-    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
-    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
-    ; CHECK-NEXT: G_STORE [[TRUNC3]](s8), %ptr(p0) :: (store (s8))
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(slt), [[CONCAT_VECTORS]](<8 x s32>), [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s8>) = G_ANYEXT [[ICMP]](<8 x s1>)
+    ; CHECK-NEXT: G_STORE [[ANYEXT]](<8 x s8>), %ptr(p0) :: (store (<8 x s1>))
     ; CHECK-NEXT: RET_ReallyLR
     %1:_(<4 x s32>) = COPY $q0
     %2:_(<4 x s32>) = COPY $q1
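
At the source level the store_8xs1 test compares eight signed 32-bit lanes against zero and stores the resulting <8 x s1> as one byte; the legalizer now expresses that as a single <8 x s32> G_ICMP feeding a G_ANYEXT, leaving the truncating store itself for the fallback path. Modeled in C++ with illustrative names:

#include <cstdint>

void storeLtZeroMask(uint8_t *Ptr, const int32_t V[8]) {
  uint8_t Mask = 0;
  for (unsigned I = 0; I != 8; ++I)
    Mask |= static_cast<uint8_t>(V[I] < 0) << I; // G_ICMP slt, lane I
  *Ptr = Mask; // G_STORE with memory type <8 x s1>
}
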
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 1fa96979f45530..7f2eefe5ed72f6 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -1,97 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG
-; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
 
 ; Basic tests from input vector to bitmask
 ; IR generated from clang for:
 ; __builtin_convertvector + reinterpret_cast<uint16&>
 
-; GISEL: warning: Instruction selection used fallback path for convert_to_bitmask4
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask2
-; GISEL-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_no_compare
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_compare_chain
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_trunc_in_chain
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_unknown_type_in_long_chain
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_different_types_in_chain
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_4xi8
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_float
-; GISEL-NEXT: warning: Instruction selection used fallback path for convert_legalized_illegal_element_size
-; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat
-; GISEL-NEXT: warning: Instruction selection used fallback path for no_combine_illegal_num_elements
-
 define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 ; Bits used in mask
-; SDAG-LABEL: convert_to_bitmask16:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, lCPI0_0 at PAGE
-; SDAG-NEXT:    cmeq.16b v0, v0, #0
-; SDAG-NEXT:    ldr q1, [x8, lCPI0_0 at PAGEOFF]
-; SDAG-NEXT:    bic.16b v0, v1, v0
-; SDAG-NEXT:    ext.16b v1, v0, v0, #8
-; SDAG-NEXT:    zip1.16b v0, v0, v1
-; SDAG-NEXT:    addv.8h h0, v0
-; SDAG-NEXT:    fmov w0, s0
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: convert_to_bitmask16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    sub sp, sp, #16
-; GISEL-NEXT:    .cfi_def_cfa_offset 16
-; GISEL-NEXT:    cmeq.16b v0, v0, #0
-; GISEL-NEXT:    mvn.16b v0, v0
-; GISEL-NEXT:    umov.b w8, v0[1]
-; GISEL-NEXT:    umov.b w9, v0[0]
-; GISEL-NEXT:    umov.b w10, v0[2]
-; GISEL-NEXT:    umov.b w11, v0[3]
-; GISEL-NEXT:    and w8, w8, #0x1
-; GISEL-NEXT:    bfi w9, w8, #1, #31
-; GISEL-NEXT:    and w8, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[4]
-; GISEL-NEXT:    orr w8, w9, w8, lsl #2
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[5]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #3
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[6]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #4
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[7]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #5
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[8]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #6
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[9]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #7
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[10]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #8
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[11]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #9
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[12]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #10
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[13]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #11
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[14]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #12
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[15]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #13
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #14
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #15
-; GISEL-NEXT:    strh w8, [sp, #14]
-; GISEL-NEXT:    and w0, w8, #0xffff
-; GISEL-NEXT:    add sp, sp, #16
-; GISEL-NEXT:    ret
+; CHECK-LABEL: convert_to_bitmask16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x8, lCPI0_0 at PAGE
+; CHECK-NEXT:    cmeq.16b v0, v0, #0
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    ldr q1, [x8, lCPI0_0 at PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    ext.16b v1, v0, v0, #8
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 
 ; Actual conversion
 
@@ -101,50 +30,19 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 }
 
 define i16 @convert_to_bitmask8(<8 x i16> %vec) {
-; SDAG-LABEL: convert_to_bitmask8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, lCPI1_0 at PAGE
-; SDAG-NEXT:    cmeq.8h v0, v0, #0
-; SDAG-NEXT:    ldr q1, [x8, lCPI1_0 at PAGEOFF]
-; SDAG-NEXT:    bic.16b v0, v1, v0
-; SDAG-NEXT:    addv.8h h0, v0
-; SDAG-NEXT:    fmov w8, s0
-; SDAG-NEXT:    and w0, w8, #0xff
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: convert_to_bitmask8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    sub sp, sp, #16
-; GISEL-NEXT:    .cfi_def_cfa_offset 16
-; GISEL-NEXT:    cmeq.8h v0, v0, #0
-; GISEL-NEXT:    mvn.16b v0, v0
-; GISEL-NEXT:    xtn.8b v0, v0
-; GISEL-NEXT:    umov.b w8, v0[1]
-; GISEL-NEXT:    umov.b w9, v0[0]
-; GISEL-NEXT:    umov.b w10, v0[2]
-; GISEL-NEXT:    umov.b w11, v0[3]
-; GISEL-NEXT:    and w8, w8, #0x1
-; GISEL-NEXT:    bfi w9, w8, #1, #31
-; GISEL-NEXT:    and w8, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[4]
-; GISEL-NEXT:    orr w8, w9, w8, lsl #2
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[5]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #3
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[6]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #4
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[7]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #5
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #6
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #7
-; GISEL-NEXT:    strb w8, [sp, #15]
-; GISEL-NEXT:    and w0, w8, #0xff
-; GISEL-NEXT:    add sp, sp, #16
-; GISEL-NEXT:    ret
+; CHECK-LABEL: convert_to_bitmask8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x8, lCPI1_0 at PAGE
+; CHECK-NEXT:    cmeq.8h v0, v0, #0
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    ldr q1, [x8, lCPI1_0 at PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w0, w8, #0xff
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
 
 
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
@@ -156,13 +54,16 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) {
 define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh4:
 ; CHECK-NEXT:    adrp x8, lCPI2_0 at PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh5:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI2_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -173,14 +74,17 @@ define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 ; CHECK-LABEL: convert_to_bitmask2:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh6:
 ; CHECK-NEXT:    adrp x8, lCPI3_0 at PAGE
 ; CHECK-NEXT:    cmeq.2d v0, v0, #0
+; CHECK-NEXT:  Lloh7:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI3_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addp.2d d0, v0
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x3
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
 
 
   %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
@@ -193,13 +97,16 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: clang_builtins_undef_concat_convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh8:
 ; CHECK-NEXT:    adrp x8, lCPI4_0 at PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh9:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI4_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh9
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -213,7 +120,9 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 ; CHECK-LABEL: convert_to_bitmask_no_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x8, lCPI5_0 at PAGE
+; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI5_0 at PAGEOFF]
 ; CHECK-NEXT:    shl.4s v0, v0, #31
 ; CHECK-NEXT:    cmlt.4s v0, v0, #0
@@ -221,6 +130,7 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
 
 
   %cmp = and <4 x i32> %vec1, %vec2
@@ -234,13 +144,16 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v2, v0, #0
 ; CHECK-NEXT:    cmeq.4s v0, v0, v1
+; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    adrp x8, lCPI6_0 at PAGE
+; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI6_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v0, v2
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -254,8 +167,10 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 ; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh14:
 ; CHECK-NEXT:    adrp x8, lCPI7_0 at PAGE
 ; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:  Lloh15:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI7_0 at PAGEOFF]
 ; CHECK-NEXT:    shl.4s v0, v0, #31
 ; CHECK-NEXT:    cmlt.4s v0, v0, #0
@@ -263,6 +178,7 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh15
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -277,6 +193,7 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
 ; CHECK-NEXT:    cmeq.4s v1, v1, #0
+; CHECK-NEXT:  Lloh16:
 ; CHECK-NEXT:    adrp x8, lCPI8_0 at PAGE
 ; CHECK-NEXT:    movi d2, #0x000000ffffffff
 ; CHECK-NEXT:    movi d3, #0x00ffffffffffff
@@ -290,6 +207,7 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK-NEXT:    mov.h v1[2], wzr
 ; CHECK-NEXT:    orr.8b v0, v0, v3
 ; CHECK-NEXT:    orr.8b v0, v1, v0
+; CHECK-NEXT:  Lloh17:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI8_0 at PAGEOFF]
 ; CHECK-NEXT:    shl.4h v0, v0, #15
 ; CHECK-NEXT:    cmlt.4h v0, v0, #0
@@ -297,6 +215,7 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh17
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -319,14 +238,17 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v1, v1, #0
 ; CHECK-NEXT:    cmeq.4h v0, v0, #0
+; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    adrp x8, lCPI9_0 at PAGE
 ; CHECK-NEXT:    xtn.4h v1, v1
 ; CHECK-NEXT:    orn.8b v0, v1, v0
+; CHECK-NEXT:  Lloh19:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI9_0 at PAGEOFF]
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh19
 
 
   %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer
@@ -337,73 +259,21 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4
 }
 
 define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
-; SDAG-LABEL: convert_to_bitmask_without_knowing_type:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    shl.16b v0, v0, #7
-; SDAG-NEXT:    adrp x8, lCPI10_0 at PAGE
-; SDAG-NEXT:    ldr q1, [x8, lCPI10_0 at PAGEOFF]
-; SDAG-NEXT:    cmlt.16b v0, v0, #0
-; SDAG-NEXT:    and.16b v0, v0, v1
-; SDAG-NEXT:    ext.16b v1, v0, v0, #8
-; SDAG-NEXT:    zip1.16b v0, v0, v1
-; SDAG-NEXT:    addv.8h h0, v0
-; SDAG-NEXT:    fmov w0, s0
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: convert_to_bitmask_without_knowing_type:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    sub sp, sp, #16
-; GISEL-NEXT:    .cfi_def_cfa_offset 16
-; GISEL-NEXT:    umov.b w8, v0[1]
-; GISEL-NEXT:    umov.b w9, v0[0]
-; GISEL-NEXT:    umov.b w10, v0[2]
-; GISEL-NEXT:    umov.b w11, v0[3]
-; GISEL-NEXT:    and w8, w8, #0x1
-; GISEL-NEXT:    bfi w9, w8, #1, #31
-; GISEL-NEXT:    and w8, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[4]
-; GISEL-NEXT:    orr w8, w9, w8, lsl #2
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[5]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #3
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[6]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #4
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[7]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #5
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[8]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #6
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[9]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #7
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[10]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #8
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[11]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #9
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[12]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #10
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[13]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #11
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[14]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #12
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[15]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #13
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #14
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #15
-; GISEL-NEXT:    strh w8, [sp, #14]
-; GISEL-NEXT:    and w0, w8, #0xffff
-; GISEL-NEXT:    add sp, sp, #16
-; GISEL-NEXT:    ret
+; CHECK-LABEL: convert_to_bitmask_without_knowing_type:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    shl.16b v0, v0, #7
+; CHECK-NEXT:  Lloh20:
+; CHECK-NEXT:    adrp x8, lCPI10_0 at PAGE
+; CHECK-NEXT:  Lloh21:
+; CHECK-NEXT:    ldr q1, [x8, lCPI10_0 at PAGEOFF]
+; CHECK-NEXT:    cmlt.16b v0, v0, #0
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    ext.16b v1, v0, v0, #8
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh21
 
   %bitmask = bitcast <16 x i1> %vec to i16
   ret i16 %bitmask
@@ -412,13 +282,16 @@ define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
 define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_2xi32:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    adrp x8, lCPI11_0 at PAGE
 ; CHECK-NEXT:    cmeq.2s v0, v0, #0
+; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI11_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addp.2s v0, v0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh23
 
   %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
   %bitmask = bitcast <2 x i1> %cmp_result to i2
@@ -429,13 +302,16 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_4xi8:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    bic.4h v0, #255, lsl #8
+; CHECK-NEXT:  Lloh24:
 ; CHECK-NEXT:    adrp x8, lCPI12_0 at PAGE
+; CHECK-NEXT:  Lloh25:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI12_0 at PAGEOFF]
 ; CHECK-NEXT:    cmeq.4h v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh25
 
   %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -446,14 +322,17 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_8xi2:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.8b v1, #3
+; CHECK-NEXT:  Lloh26:
 ; CHECK-NEXT:    adrp x8, lCPI13_0 at PAGE
 ; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:  Lloh27:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI13_0 at PAGEOFF]
 ; CHECK-NEXT:    cmeq.8b v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh27
 
   %cmp_result = icmp ne <8 x i2> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -465,13 +344,16 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    fcmgt.4s v1, v0, #0.0
 ; CHECK-NEXT:    fcmlt.4s v0, v0, #0.0
+; CHECK-NEXT:  Lloh28:
 ; CHECK-NEXT:    adrp x8, lCPI14_0 at PAGE
 ; CHECK-NEXT:    orr.16b v0, v0, v1
+; CHECK-NEXT:  Lloh29:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI14_0 at PAGEOFF]
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh28, Lloh29
 
 
   %cmp_result = fcmp one <4 x float> %vec, zeroinitializer
@@ -482,58 +364,24 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; Larger vector types don't map directly, but they can be split/truncated and then converted.
 ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
 define i8 @convert_large_vector(<8 x i32> %vec) {
-; SDAG-LABEL: convert_large_vector:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    sub sp, sp, #16
-; SDAG-NEXT:    .cfi_def_cfa_offset 16
-; SDAG-NEXT:    cmeq.4s v1, v1, #0
-; SDAG-NEXT:    cmeq.4s v0, v0, #0
-; SDAG-NEXT:    adrp x8, lCPI15_0 at PAGE
-; SDAG-NEXT:    uzp1.8h v0, v0, v1
-; SDAG-NEXT:    ldr q1, [x8, lCPI15_0 at PAGEOFF]
-; SDAG-NEXT:    bic.16b v0, v1, v0
-; SDAG-NEXT:    addv.8h h0, v0
-; SDAG-NEXT:    fmov w8, s0
-; SDAG-NEXT:    and w0, w8, #0xff
-; SDAG-NEXT:    add sp, sp, #16
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: convert_large_vector:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    sub sp, sp, #16
-; GISEL-NEXT:    .cfi_def_cfa_offset 16
-; GISEL-NEXT:    cmeq.4s v0, v0, #0
-; GISEL-NEXT:    cmeq.4s v1, v1, #0
-; GISEL-NEXT:    mvn.16b v0, v0
-; GISEL-NEXT:    mvn.16b v1, v1
-; GISEL-NEXT:    uzp1.8h v0, v0, v1
-; GISEL-NEXT:    xtn.8b v0, v0
-; GISEL-NEXT:    umov.b w8, v0[1]
-; GISEL-NEXT:    umov.b w9, v0[0]
-; GISEL-NEXT:    umov.b w10, v0[2]
-; GISEL-NEXT:    umov.b w11, v0[3]
-; GISEL-NEXT:    and w8, w8, #0x1
-; GISEL-NEXT:    bfi w9, w8, #1, #31
-; GISEL-NEXT:    and w8, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[4]
-; GISEL-NEXT:    orr w8, w9, w8, lsl #2
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[5]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #3
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    umov.b w10, v0[6]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #4
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    umov.b w11, v0[7]
-; GISEL-NEXT:    orr w8, w8, w9, lsl #5
-; GISEL-NEXT:    and w9, w10, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #6
-; GISEL-NEXT:    and w9, w11, #0x1
-; GISEL-NEXT:    orr w8, w8, w9, lsl #7
-; GISEL-NEXT:    strb w8, [sp, #15]
-; GISEL-NEXT:    and w0, w8, #0xff
-; GISEL-NEXT:    add sp, sp, #16
-; GISEL-NEXT:    ret
+; CHECK-LABEL: convert_large_vector:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    cmeq.4s v1, v1, #0
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh30:
+; CHECK-NEXT:    adrp x8, lCPI15_0 at PAGE
+; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:  Lloh31:
+; CHECK-NEXT:    ldr q1, [x8, lCPI15_0 at PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w0, w8, #0xff
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh31
 
 
    %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
@@ -545,14 +393,17 @@ define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
 ; CHECK-LABEL: convert_legalized_illegal_element_size:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.4s v1, #63, msl #16
+; CHECK-NEXT:  Lloh32:
 ; CHECK-NEXT:    adrp x8, lCPI16_0 at PAGE
 ; CHECK-NEXT:    cmtst.4s v0, v0, v1
+; CHECK-NEXT:  Lloh33:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI16_0 at PAGEOFF]
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh33
 
   %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -564,6 +415,7 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-LABEL: no_direct_convert_for_bad_concat:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmtst.4s v0, v0, v0
+; CHECK-NEXT:  Lloh34:
 ; CHECK-NEXT:    adrp x8, lCPI17_0 at PAGE
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    umov.h w9, v0[0]
@@ -575,12 +427,14 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-NEXT:    umov.h w9, v0[3]
 ; CHECK-NEXT:    mov.b v1[7], w9
 ; CHECK-NEXT:    shl.8b v0, v1, #7
+; CHECK-NEXT:  Lloh35:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI17_0 at PAGEOFF]
 ; CHECK-NEXT:    cmlt.8b v0, v0, #0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh35
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %vector_pad = shufflevector <4 x i1> poison, <4 x i1> %cmp_result, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
@@ -589,18 +443,11 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 }
 
 define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
-; SDAG-LABEL: no_convert_without_direct_bitcast:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    cmtst.8h v0, v0, v0
-; SDAG-NEXT:    xtn.8b v0, v0
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: no_convert_without_direct_bitcast:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    cmeq.8h v0, v0, #0
-; GISEL-NEXT:    mvn.16b v0, v0
-; GISEL-NEXT:    xtn.8b v0, v0
-; GISEL-NEXT:    ret
+; CHECK-LABEL: no_convert_without_direct_bitcast:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmtst.8h v0, v0, v0
+; CHECK-NEXT:    xtn.8b v0, v0
+; CHECK-NEXT:    ret
 
    %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
    ret <8 x i1> %cmp_result
@@ -645,220 +492,28 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 
 ; Only apply the combine when casting a vector to a scalar.
 define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
-; SDAG-LABEL: vector_to_vector_cast:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    sub sp, sp, #16
-; SDAG-NEXT:    shl.16b v0, v0, #7
-; SDAG-NEXT:    adrp x8, lCPI20_0 at PAGE
-; SDAG-NEXT:    ldr q1, [x8, lCPI20_0 at PAGEOFF]
-; SDAG-NEXT:    add x8, sp, #14
-; SDAG-NEXT:    cmlt.16b v0, v0, #0
-; SDAG-NEXT:    and.16b v0, v0, v1
-; SDAG-NEXT:    ext.16b v1, v0, v0, #8
-; SDAG-NEXT:    zip1.16b v0, v0, v1
-; SDAG-NEXT:    addv.8h h0, v0
-; SDAG-NEXT:    str h0, [sp, #14]
-; SDAG-NEXT:    ld1.b { v0 }[0], [x8]
-; SDAG-NEXT:    orr x8, x8, #0x1
-; SDAG-NEXT:    ld1.b { v0 }[4], [x8]
-; SDAG-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; SDAG-NEXT:    add sp, sp, #16
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: vector_to_vector_cast:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    sub sp, sp, #16
-; GISEL-NEXT:    umov.b w8, v0[1]
-; GISEL-NEXT:    mov d1, v0[1]
-; GISEL-NEXT:    umov.b w10, v0[1]
-; GISEL-NEXT:    umov.b w9, v0[0]
-; GISEL-NEXT:    umov.b w13, v0[0]
-; GISEL-NEXT:    umov.b w14, v0[2]
-; GISEL-NEXT:    umov.b w15, v0[3]
-; GISEL-NEXT:    umov.b w11, v0[2]
-; GISEL-NEXT:    umov.b w16, v0[4]
-; GISEL-NEXT:    umov.b w17, v0[5]
-; GISEL-NEXT:    umov.b w12, v0[3]
-; GISEL-NEXT:    and w8, w8, #0x1
-; GISEL-NEXT:    and w10, w10, #0x1
-; GISEL-NEXT:    umov.b w0, v1[1]
-; GISEL-NEXT:    bfi w9, w8, #1, #31
-; GISEL-NEXT:    bfi w13, w10, #1, #31
-; GISEL-NEXT:    and w14, w14, #0x1
-; GISEL-NEXT:    umov.b w8, v1[0]
-; GISEL-NEXT:    umov.b w10, v1[2]
-; GISEL-NEXT:    and w15, w15, #0x1
-; GISEL-NEXT:    orr w13, w13, w14, lsl #2
-; GISEL-NEXT:    umov.b w14, v1[3]
-; GISEL-NEXT:    and w11, w11, #0x1
-; GISEL-NEXT:    and w0, w0, #0x1
-; GISEL-NEXT:    and w16, w16, #0x1
-; GISEL-NEXT:    orr w9, w9, w11, lsl #2
-; GISEL-NEXT:    orr w13, w13, w15, lsl #3
-; GISEL-NEXT:    umov.b w15, v1[4]
-; GISEL-NEXT:    umov.b w11, v0[6]
-; GISEL-NEXT:    bfi w8, w0, #1, #31
-; GISEL-NEXT:    and w10, w10, #0x1
-; GISEL-NEXT:    and w17, w17, #0x1
-; GISEL-NEXT:    orr w13, w13, w16, lsl #4
-; GISEL-NEXT:    and w14, w14, #0x1
-; GISEL-NEXT:    umov.b w0, v0[7]
-; GISEL-NEXT:    orr w8, w8, w10, lsl #2
-; GISEL-NEXT:    umov.b w10, v1[5]
-; GISEL-NEXT:    umov.b w16, v1[6]
-; GISEL-NEXT:    orr w13, w13, w17, lsl #5
-; GISEL-NEXT:    umov.b w17, v0[4]
-; GISEL-NEXT:    and w15, w15, #0x1
-; GISEL-NEXT:    orr w8, w8, w14, lsl #3
-; GISEL-NEXT:    and w12, w12, #0x1
-; GISEL-NEXT:    and w11, w11, #0x1
-; GISEL-NEXT:    umov.b w14, v1[7]
-; GISEL-NEXT:    orr w9, w9, w12, lsl #3
-; GISEL-NEXT:    orr w11, w13, w11, lsl #6
-; GISEL-NEXT:    orr w8, w8, w15, lsl #4
-; GISEL-NEXT:    umov.b w15, v0[5]
-; GISEL-NEXT:    and w10, w10, #0x1
-; GISEL-NEXT:    and w0, w0, #0x1
-; GISEL-NEXT:    and w12, w17, #0x1
-; GISEL-NEXT:    umov.b w13, v0[1]
-; GISEL-NEXT:    orr w8, w8, w10, lsl #5
-; GISEL-NEXT:    and w16, w16, #0x1
-; GISEL-NEXT:    orr w9, w9, w12, lsl #4
-; GISEL-NEXT:    umov.b w10, v0[0]
-; GISEL-NEXT:    orr w11, w11, w0, lsl #7
-; GISEL-NEXT:    and w14, w14, #0x1
-; GISEL-NEXT:    and w12, w15, #0x1
-; GISEL-NEXT:    umov.b w15, v0[2]
-; GISEL-NEXT:    orr w8, w8, w16, lsl #6
-; GISEL-NEXT:    orr w9, w9, w12, lsl #5
-; GISEL-NEXT:    umov.b w12, v0[6]
-; GISEL-NEXT:    strb w11, [sp, #8]
-; GISEL-NEXT:    and w11, w13, #0x1
-; GISEL-NEXT:    umov.b w13, v0[3]
-; GISEL-NEXT:    orr w8, w8, w14, lsl #7
-; GISEL-NEXT:    umov.b w14, v0[7]
-; GISEL-NEXT:    ldr b0, [sp, #8]
-; GISEL-NEXT:    bfi w10, w11, #1, #31
-; GISEL-NEXT:    and w11, w15, #0x1
-; GISEL-NEXT:    strb w8, [sp, #9]
-; GISEL-NEXT:    umov.b w15, v0[4]
-; GISEL-NEXT:    and w8, w12, #0x1
-; GISEL-NEXT:    orr w10, w10, w11, lsl #2
-; GISEL-NEXT:    orr w8, w9, w8, lsl #6
-; GISEL-NEXT:    and w9, w13, #0x1
-; GISEL-NEXT:    umov.b w11, v0[1]
-; GISEL-NEXT:    orr w9, w10, w9, lsl #3
-; GISEL-NEXT:    umov.b w10, v0[5]
-; GISEL-NEXT:    umov.b w12, v0[0]
-; GISEL-NEXT:    and w13, w14, #0x1
-; GISEL-NEXT:    umov.b w16, v0[2]
-; GISEL-NEXT:    umov.b w17, v0[3]
-; GISEL-NEXT:    and w14, w15, #0x1
-; GISEL-NEXT:    umov.b w15, v0[2]
-; GISEL-NEXT:    orr w8, w8, w13, lsl #7
-; GISEL-NEXT:    orr w9, w9, w14, lsl #4
-; GISEL-NEXT:    umov.b w13, v0[6]
-; GISEL-NEXT:    and w11, w11, #0x1
-; GISEL-NEXT:    umov.b w14, v0[3]
-; GISEL-NEXT:    strb w8, [sp, #10]
-; GISEL-NEXT:    and w8, w10, #0x1
-; GISEL-NEXT:    bfi w12, w11, #1, #31
-; GISEL-NEXT:    orr w8, w9, w8, lsl #5
-; GISEL-NEXT:    umov.b w10, v0[4]
-; GISEL-NEXT:    and w9, w15, #0x1
-; GISEL-NEXT:    umov.b w11, v0[7]
-; GISEL-NEXT:    umov.b w15, v0[1]
-; GISEL-NEXT:    orr w9, w12, w9, lsl #2
-; GISEL-NEXT:    umov.b w12, v0[5]
-; GISEL-NEXT:    and w13, w13, #0x1
-; GISEL-NEXT:    and w14, w14, #0x1
-; GISEL-NEXT:    orr w8, w8, w13, lsl #6
-; GISEL-NEXT:    umov.b w13, v0[0]
-; GISEL-NEXT:    orr w9, w9, w14, lsl #3
-; GISEL-NEXT:    and w10, w10, #0x1
-; GISEL-NEXT:    umov.b w14, v0[6]
-; GISEL-NEXT:    and w11, w11, #0x1
-; GISEL-NEXT:    and w15, w15, #0x1
-; GISEL-NEXT:    umov.b w0, v0[3]
-; GISEL-NEXT:    orr w9, w9, w10, lsl #4
-; GISEL-NEXT:    and w10, w12, #0x1
-; GISEL-NEXT:    umov.b w12, v0[7]
-; GISEL-NEXT:    orr w8, w8, w11, lsl #7
-; GISEL-NEXT:    bfi w13, w15, #1, #31
-; GISEL-NEXT:    and w11, w16, #0x1
-; GISEL-NEXT:    orr w9, w9, w10, lsl #5
-; GISEL-NEXT:    and w10, w14, #0x1
-; GISEL-NEXT:    umov.b w14, v0[4]
-; GISEL-NEXT:    strb w8, [sp, #11]
-; GISEL-NEXT:    umov.b w15, v0[1]
-; GISEL-NEXT:    umov.b w16, v0[3]
-; GISEL-NEXT:    orr w8, w9, w10, lsl #6
-; GISEL-NEXT:    orr w9, w13, w11, lsl #2
-; GISEL-NEXT:    and w10, w12, #0x1
-; GISEL-NEXT:    and w11, w17, #0x1
-; GISEL-NEXT:    umov.b w12, v0[5]
-; GISEL-NEXT:    umov.b w17, v0[0]
-; GISEL-NEXT:    orr w8, w8, w10, lsl #7
-; GISEL-NEXT:    orr w9, w9, w11, lsl #3
-; GISEL-NEXT:    umov.b w10, v0[1]
-; GISEL-NEXT:    and w11, w14, #0x1
-; GISEL-NEXT:    umov.b w14, v0[0]
-; GISEL-NEXT:    and w15, w15, #0x1
-; GISEL-NEXT:    orr w9, w9, w11, lsl #4
-; GISEL-NEXT:    umov.b w11, v0[2]
-; GISEL-NEXT:    umov.b w13, v0[6]
-; GISEL-NEXT:    and w12, w12, #0x1
-; GISEL-NEXT:    bfi w17, w15, #1, #31
-; GISEL-NEXT:    umov.b w15, v0[5]
-; GISEL-NEXT:    orr w9, w9, w12, lsl #5
-; GISEL-NEXT:    and w10, w10, #0x1
-; GISEL-NEXT:    umov.b w12, v0[2]
-; GISEL-NEXT:    bfi w14, w10, #1, #31
-; GISEL-NEXT:    umov.b w10, v0[4]
-; GISEL-NEXT:    ldr b1, [sp, #9]
-; GISEL-NEXT:    and w11, w11, #0x1
-; GISEL-NEXT:    and w13, w13, #0x1
-; GISEL-NEXT:    strb w8, [sp, #12]
-; GISEL-NEXT:    orr w11, w14, w11, lsl #2
-; GISEL-NEXT:    and w14, w16, #0x1
-; GISEL-NEXT:    umov.b w16, v0[4]
-; GISEL-NEXT:    and w12, w12, #0x1
-; GISEL-NEXT:    and w15, w15, #0x1
-; GISEL-NEXT:    orr w9, w9, w13, lsl #6
-; GISEL-NEXT:    orr w11, w11, w14, lsl #3
-; GISEL-NEXT:    orr w12, w17, w12, lsl #2
-; GISEL-NEXT:    and w10, w10, #0x1
-; GISEL-NEXT:    and w17, w0, #0x1
-; GISEL-NEXT:    umov.b w0, v0[5]
-; GISEL-NEXT:    umov.b w14, v0[6]
-; GISEL-NEXT:    orr w10, w11, w10, lsl #4
-; GISEL-NEXT:    orr w12, w12, w17, lsl #3
-; GISEL-NEXT:    umov.b w11, v0[7]
-; GISEL-NEXT:    and w16, w16, #0x1
-; GISEL-NEXT:    umov.b w17, v0[6]
-; GISEL-NEXT:    orr w10, w10, w15, lsl #5
-; GISEL-NEXT:    umov.b w15, v0[7]
-; GISEL-NEXT:    orr w12, w12, w16, lsl #4
-; GISEL-NEXT:    and w16, w0, #0x1
-; GISEL-NEXT:    umov.b w0, v0[7]
-; GISEL-NEXT:    and w14, w14, #0x1
-; GISEL-NEXT:    orr w12, w12, w16, lsl #5
-; GISEL-NEXT:    orr w10, w10, w14, lsl #6
-; GISEL-NEXT:    and w11, w11, #0x1
-; GISEL-NEXT:    and w13, w17, #0x1
-; GISEL-NEXT:    orr w9, w9, w11, lsl #7
-; GISEL-NEXT:    mov.s v0[1], v1[0]
-; GISEL-NEXT:    orr w11, w12, w13, lsl #6
-; GISEL-NEXT:    and w12, w15, #0x1
-; GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; GISEL-NEXT:    orr w8, w10, w12, lsl #7
-; GISEL-NEXT:    and w10, w0, #0x1
-; GISEL-NEXT:    strb w9, [sp, #13]
-; GISEL-NEXT:    orr w9, w11, w10, lsl #7
-; GISEL-NEXT:    strb w8, [sp, #14]
-; GISEL-NEXT:    strb w9, [sp, #15]
-; GISEL-NEXT:    add sp, sp, #16
-; GISEL-NEXT:    ret
+; CHECK-LABEL: vector_to_vector_cast:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    shl.16b v0, v0, #7
+; CHECK-NEXT:  Lloh36:
+; CHECK-NEXT:    adrp x8, lCPI20_0 at PAGE
+; CHECK-NEXT:  Lloh37:
+; CHECK-NEXT:    ldr q1, [x8, lCPI20_0 at PAGEOFF]
+; CHECK-NEXT:    add x8, sp, #14
+; CHECK-NEXT:    cmlt.16b v0, v0, #0
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    ext.16b v1, v0, v0, #8
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    str h0, [sp, #14]
+; CHECK-NEXT:    ld1.b { v0 }[0], [x8]
+; CHECK-NEXT:    orr x8, x8, #0x1
+; CHECK-NEXT:    ld1.b { v0 }[4], [x8]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh37
   %bc = bitcast <16 x i1> %arg to <2 x i8>
   ret <2 x i8> %bc
 }
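
The SelectionDAG sequence now shared by all the bitmask tests follows one movemask-style pattern: shift each lane's boolean into the sign bit, turn it into an all-ones lane mask with a signed compare against zero, AND with a constant-pool vector of per-lane bit weights, and reduce horizontally. A C++ sketch of the 16-lane variant using NEON intrinsics, assuming a little-endian AArch64 target (the weight table mirrors the lCPI constant-pool entries):

#include <arm_neon.h>
#include <cstdint>

uint16_t bitmask16(uint8x16_t BoolLanes) {
  // shl.16b #7 then cmlt.16b #0: splat bit 0 across each lane.
  int8x16_t Sign = vshlq_n_s8(vreinterpretq_s8_u8(BoolLanes), 7);
  uint8x16_t LaneMask = vcltzq_s8(Sign);
  // and.16b with weights 1,2,4,...,128 for each half of the vector.
  static const uint8_t Weights[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                      1, 2, 4, 8, 16, 32, 64, 128};
  uint8x16_t Bits = vandq_u8(LaneMask, vld1q_u8(Weights));
  // ext.16b/zip1.16b pair the low and high halves into u16 lanes so the
  // high half lands in the top byte, then addv.8h sums the lanes.
  uint8x16_t Hi = vextq_u8(Bits, Bits, 8);
  uint8x16_t Zipped = vzip1q_u8(Bits, Hi);
  return vaddvq_u16(vreinterpretq_u16_u8(Zipped));
}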


