[llvm] 6b0807f - [AArch64][GlobalISel] Add support for lowering trunc stores of vector bools.

Mon Jan 6 10:21:45 PST 2025

Author: Amara Emerson
Date: 2025-01-06T10:21:42-08:00
New Revision: 6b0807fe2b8af7361f98f0f947a3129a6ab79f7e

URL: https://github.com/llvm/llvm-project/commit/6b0807fe2b8af7361f98f0f947a3129a6ab79f7e
DIFF: https://github.com/llvm/llvm-project/commit/6b0807fe2b8af7361f98f0f947a3129a6ab79f7e.diff

LOG: [AArch64][GlobalISel] Add support for lowering trunc stores of vector bools.

This is essentially a port of TargetLowering::scalarizeVectorStore(), which
is used for the case where we have something like a store of <8 x s8> truncating
to <8 x s1> in memory. The naive lowering is a sequence of extracts to compute
a scalar value to store.

AArch64's DAG implementation has some more smarts to improve this further which
we can do later.

Reviewers: topperc, davemgreen

Pull Request: https://github.com/llvm/llvm-project/pull/121169

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
    llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index fac059803b9489..4e18f5cc913a7e 100644

--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -302,6 +302,9 @@ class LegalizerHelper {
   /// same type as \p Res.
   MachineInstrBuilder createStackStoreLoad(const DstOp &Res, const SrcOp &Val);
 
+  /// Given a store of a boolean vector, scalarize it.
+  LegalizeResult scalarizeVectorBooleanStore(GStore &MI);
+
   /// Get a pointer to vector element \p Index located in memory for a vector of
   /// type \p VecTy starting at a base address of \p VecPtr. If \p Index is out
   /// of bounds the returned pointer is unspecified, but will be within the

diff  --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 7dece931e8e0eb..0bfa897ecf4047 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4143,9 +4143,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   }
 
   if (MemTy.isVector()) {
-    // TODO: Handle vector trunc stores
     if (MemTy != SrcTy)
-      return UnableToLegalize;
+      return scalarizeVectorBooleanStore(StoreMI);
 
     // TODO: We can do better than scalarizing the vector and at least split it
     // in half.
@@ -4200,6 +4199,50 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   return Legalized;
 }
 
+LegalizerHelper::LegalizeResult
+LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
+  Register SrcReg = StoreMI.getValueReg();
+  Register PtrReg = StoreMI.getPointerReg();
+  LLT SrcTy = MRI.getType(SrcReg);
+  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
+  LLT MemTy = MMO.getMemoryType();
+  LLT MemScalarTy = MemTy.getElementType();
+  MachineFunction &MF = MIRBuilder.getMF();
+
+  assert(SrcTy.isVector() && "Expect a vector store type");
+
+  if (!MemScalarTy.isByteSized()) {
+    // We need to build an integer scalar of the vector bit pattern.
+    // It's not legal for us to add padding when storing a vector.
+    unsigned NumBits = MemTy.getSizeInBits();
+    LLT IntTy = LLT::scalar(NumBits);
+    auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
+    LLT IdxTy = getLLTForMVT(TLI.getVectorIdxTy(MF.getDataLayout()));
+
+    for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
+      auto Elt = MIRBuilder.buildExtractVectorElement(
+          SrcTy.getElementType(), SrcReg, MIRBuilder.buildConstant(IdxTy, I));
+      auto Trunc = MIRBuilder.buildTrunc(MemScalarTy, Elt);
+      auto ZExt = MIRBuilder.buildZExt(IntTy, Trunc);
+      unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
+                                  ? (MemTy.getNumElements() - 1) - I
+                                  : I;
+      auto ShiftAmt = MIRBuilder.buildConstant(
+          IntTy, ShiftIntoIdx * MemScalarTy.getSizeInBits());
+      auto Shifted = MIRBuilder.buildShl(IntTy, ZExt, ShiftAmt);
+      CurrVal = MIRBuilder.buildOr(IntTy, CurrVal, Shifted);
+    }
+    auto PtrInfo = MMO.getPointerInfo();
+    auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, IntTy);
+    MIRBuilder.buildStore(CurrVal, PtrReg, *NewMMO);
+    StoreMI.eraseFromParent();
+    return Legalized;
+  }
+
+  // TODO: implement simple scalarization.
+  return UnableToLegalize;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   switch (MI.getOpcode()) {

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 09c2825ac1b8cc..c18f8fc81e93f4 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -474,7 +474,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                  })
       .customIf(IsPtrVecPred)
       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
-      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+      .lower();
 
   getActionDefinitionsBuilder(G_INDEXED_STORE)
       // Idx 0 == Ptr, Idx 1 == Val

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
index 7b24bb1227fa26..4d461c971d3320 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=2  %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1  %s -o - | FileCheck %s
 ---
 name:            scalar_to_oversize_vector
 tracksRegLiveness: true
@@ -48,17 +48,66 @@ body:             |
     G_BR %bb.2
 
 ...
-# This test currently is expected to fall back after reaching truncstore of <8 x s8> as <8 x s1>.
 ---
 name:            boolean_vector_to_scalar
 tracksRegLiveness: true
 body:             |
     bb.1:
     ; CHECK-LABEL: name: boolean_vector_to_scalar
-    ; CHECK: %vec:_(<8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s8>) = G_IMPLICIT_DEF
     ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s8>) = G_ANYEXT %vec(<8 x s1>)
-    ; CHECK-NEXT: G_STORE [[ANYEXT]](<8 x s8>), [[FRAME_INDEX]](p0) :: (store (<8 x s1>) into %stack.0)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C1]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C1]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C1]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C1]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C1]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C1]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s8), [[FRAME_INDEX]](p0) :: (store (s8) into %stack.0)
     ; CHECK-NEXT: %bc:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s8) from %stack.0)
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %bc(s8)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
index de70f89461780b..1df6297e363833 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=2 %s -o - | FileCheck %s
-# This test currently is expected to fall back after reaching truncstore of <8 x s8> as <8 x s1>.
+# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
 ---
 name:            store_8xs1
 tracksRegLiveness: true
@@ -13,12 +12,67 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(slt), [[CONCAT_VECTORS]](<8 x s32>), [[BUILD_VECTOR]]
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s8>) = G_ANYEXT [[ICMP]](<8 x s1>)
-    ; CHECK-NEXT: G_STORE [[ANYEXT]](<8 x s8>), %ptr(p0) :: (store (<8 x s1>))
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY]](<4 x s32>), [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY1]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C1]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C2]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C2]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C2]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C2]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC3]](s8), %ptr(p0) :: (store (s8))
     ; CHECK-NEXT: RET_ReallyLR
     %1:_(<4 x s32>) = COPY $q0
     %2:_(<4 x s32>) = COPY $q1

diff  --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 7f2eefe5ed72f6..1fa96979f45530 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -1,26 +1,97 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; Basic tests from input vector to bitmask
 ; IR generated from clang for:
 ; __builtin_convertvector + reinterpret_cast<uint16&>
 
+; GISEL: warning: Instruction selection used fallback path for convert_to_bitmask4
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask2
+; GISEL-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_no_compare
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_compare_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_trunc_in_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_unknown_type_in_long_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_with_
diff erent_types_in_chain
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_4xi8
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_float
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_legalized_illegal_element_size
+; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat
+; GISEL-NEXT: warning: Instruction selection used fallback path for no_combine_illegal_num_elements
+
 define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 ; Bits used in mask
-; CHECK-LABEL: convert_to_bitmask16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x8, lCPI0_0 at PAGE
-; CHECK-NEXT:    cmeq.16b v0, v0, #0
-; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    ldr q1, [x8, lCPI0_0 at PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+; SDAG-LABEL: convert_to_bitmask16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI0_0 at PAGE
+; SDAG-NEXT:    cmeq.16b v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI0_0 at PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.16b v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 ; Actual conversion
 
@@ -30,19 +101,50 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 }
 
 define i16 @convert_to_bitmask8(<8 x i16> %vec) {
-; CHECK-LABEL: convert_to_bitmask8:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x8, lCPI1_0 at PAGE
-; CHECK-NEXT:    cmeq.8h v0, v0, #0
-; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    ldr q1, [x8, lCPI1_0 at PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
+; SDAG-LABEL: convert_to_bitmask8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI1_0 at PAGE
+; SDAG-NEXT:    cmeq.8h v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI1_0 at PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
@@ -54,16 +156,13 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) {
 define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh4:
 ; CHECK-NEXT:    adrp x8, lCPI2_0 at PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh5:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI2_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -74,17 +173,14 @@ define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 ; CHECK-LABEL: convert_to_bitmask2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh6:
 ; CHECK-NEXT:    adrp x8, lCPI3_0 at PAGE
 ; CHECK-NEXT:    cmeq.2d v0, v0, #0
-; CHECK-NEXT:  Lloh7:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI3_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addp.2d d0, v0
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x3
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
 
 
   %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
@@ -97,16 +193,13 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: clang_builtins_undef_concat_convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh8:
 ; CHECK-NEXT:    adrp x8, lCPI4_0 at PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh9:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI4_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh9
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -120,9 +213,7 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 ; CHECK-LABEL: convert_to_bitmask_no_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x8, lCPI5_0 at PAGE
-; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI5_0 at PAGEOFF]
 ; CHECK-NEXT:    shl.4s v0, v0, #31
 ; CHECK-NEXT:    cmlt.4s v0, v0, #0
@@ -130,7 +221,6 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
 
 
   %cmp = and <4 x i32> %vec1, %vec2
@@ -144,16 +234,13 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v2, v0, #0
 ; CHECK-NEXT:    cmeq.4s v0, v0, v1
-; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    adrp x8, lCPI6_0 at PAGE
-; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI6_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v0, v2
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -167,10 +254,8 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 ; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh14:
 ; CHECK-NEXT:    adrp x8, lCPI7_0 at PAGE
 ; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:  Lloh15:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI7_0 at PAGEOFF]
 ; CHECK-NEXT:    shl.4s v0, v0, #31
 ; CHECK-NEXT:    cmlt.4s v0, v0, #0
@@ -178,7 +263,6 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh15
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -193,7 +277,6 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
 ; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:  Lloh16:
 ; CHECK-NEXT:    adrp x8, lCPI8_0 at PAGE
 ; CHECK-NEXT:    movi d2, #0x000000ffffffff
 ; CHECK-NEXT:    movi d3, #0x00ffffffffffff
@@ -207,7 +290,6 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK-NEXT:    mov.h v1[2], wzr
 ; CHECK-NEXT:    orr.8b v0, v0, v3
 ; CHECK-NEXT:    orr.8b v0, v1, v0
-; CHECK-NEXT:  Lloh17:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI8_0 at PAGEOFF]
 ; CHECK-NEXT:    shl.4h v0, v0, #15
 ; CHECK-NEXT:    cmlt.4h v0, v0, #0
@@ -215,7 +297,6 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh17
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -238,17 +319,14 @@ define i4 @convert_to_bitmask_with_
diff erent_types_in_chain(<4 x i16> %vec1, <4
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmeq.4s v1, v1, #0
 ; CHECK-NEXT:    cmeq.4h v0, v0, #0
-; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    adrp x8, lCPI9_0 at PAGE
 ; CHECK-NEXT:    xtn.4h v1, v1
 ; CHECK-NEXT:    orn.8b v0, v1, v0
-; CHECK-NEXT:  Lloh19:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI9_0 at PAGEOFF]
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh19
 
 
   %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer
@@ -259,21 +337,73 @@ define i4 @convert_to_bitmask_with_
diff erent_types_in_chain(<4 x i16> %vec1, <4
 }
 
 define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
-; CHECK-LABEL: convert_to_bitmask_without_knowing_type:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    adrp x8, lCPI10_0 at PAGE
-; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    ldr q1, [x8, lCPI10_0 at PAGEOFF]
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh21
+; SDAG-LABEL: convert_to_bitmask_without_knowing_type:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI10_0 at PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI10_0 at PAGEOFF]
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_without_knowing_type:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %bitmask = bitcast <16 x i1> %vec to i16
   ret i16 %bitmask
@@ -282,16 +412,13 @@ define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
 define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_2xi32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    adrp x8, lCPI11_0 at PAGE
 ; CHECK-NEXT:    cmeq.2s v0, v0, #0
-; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI11_0 at PAGEOFF]
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addp.2s v0, v0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh23
 
   %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
   %bitmask = bitcast <2 x i1> %cmp_result to i2
@@ -302,16 +429,13 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_4xi8:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:  Lloh24:
 ; CHECK-NEXT:    adrp x8, lCPI12_0 at PAGE
-; CHECK-NEXT:  Lloh25:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI12_0 at PAGEOFF]
 ; CHECK-NEXT:    cmeq.4h v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh25
 
   %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -322,17 +446,14 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_8xi2:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.8b v1, #3
-; CHECK-NEXT:  Lloh26:
 ; CHECK-NEXT:    adrp x8, lCPI13_0 at PAGE
 ; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:  Lloh27:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI13_0 at PAGEOFF]
 ; CHECK-NEXT:    cmeq.8b v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh27
 
   %cmp_result = icmp ne <8 x i2> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -344,16 +465,13 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    fcmgt.4s v1, v0, #0.0
 ; CHECK-NEXT:    fcmlt.4s v0, v0, #0.0
-; CHECK-NEXT:  Lloh28:
 ; CHECK-NEXT:    adrp x8, lCPI14_0 at PAGE
 ; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:  Lloh29:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI14_0 at PAGEOFF]
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh28, Lloh29
 
 
   %cmp_result = fcmp one <4 x float> %vec, zeroinitializer
@@ -364,24 +482,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; Larger vector types don't map directly, but the can be split/truncated and then converted.
 ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
 define i8 @convert_large_vector(<8 x i32> %vec) {
-; CHECK-LABEL: convert_large_vector:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh30:
-; CHECK-NEXT:    adrp x8, lCPI15_0 at PAGE
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-NEXT:  Lloh31:
-; CHECK-NEXT:    ldr q1, [x8, lCPI15_0 at PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh31
+; SDAG-LABEL: convert_large_vector:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI15_0 at PAGE
+; SDAG-NEXT:    uzp1.8h v0, v0, v1
+; SDAG-NEXT:    ldr q1, [x8, lCPI15_0 at PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_large_vector:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mvn.16b v1, v1
+; GISEL-NEXT:    uzp1.8h v0, v0, v1
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
    %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
@@ -393,17 +545,14 @@ define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
 ; CHECK-LABEL: convert_legalized_illegal_element_size:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.4s v1, #63, msl #16
-; CHECK-NEXT:  Lloh32:
 ; CHECK-NEXT:    adrp x8, lCPI16_0 at PAGE
 ; CHECK-NEXT:    cmtst.4s v0, v0, v1
-; CHECK-NEXT:  Lloh33:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI16_0 at PAGEOFF]
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.4h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh33
 
   %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -415,7 +564,6 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-LABEL: no_direct_convert_for_bad_concat:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmtst.4s v0, v0, v0
-; CHECK-NEXT:  Lloh34:
 ; CHECK-NEXT:    adrp x8, lCPI17_0 at PAGE
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    umov.h w9, v0[0]
@@ -427,14 +575,12 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-NEXT:    umov.h w9, v0[3]
 ; CHECK-NEXT:    mov.b v1[7], w9
 ; CHECK-NEXT:    shl.8b v0, v1, #7
-; CHECK-NEXT:  Lloh35:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI17_0 at PAGEOFF]
 ; CHECK-NEXT:    cmlt.8b v0, v0, #0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh35
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %vector_pad = shufflevector <4 x i1> poison, <4 x i1> %cmp_result, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
@@ -443,11 +589,18 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 }
 
 define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
-; CHECK-LABEL: no_convert_without_direct_bitcast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmtst.8h v0, v0, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: no_convert_without_direct_bitcast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmtst.8h v0, v0, v0
+; SDAG-NEXT:    xtn.8b v0, v0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: no_convert_without_direct_bitcast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    ret
 
    %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
    ret <8 x i1> %cmp_result
@@ -492,28 +645,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 
 ; Only apply the combine when casting a vector to a scalar.
 define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
-; CHECK-LABEL: vector_to_vector_cast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh36:
-; CHECK-NEXT:    adrp x8, lCPI20_0 at PAGE
-; CHECK-NEXT:  Lloh37:
-; CHECK-NEXT:    ldr q1, [x8, lCPI20_0 at PAGEOFF]
-; CHECK-NEXT:    add x8, sp, #14
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    str h0, [sp, #14]
-; CHECK-NEXT:    ld1.b { v0 }[0], [x8]
-; CHECK-NEXT:    orr x8, x8, #0x1
-; CHECK-NEXT:    ld1.b { v0 }[4], [x8]
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh37
+; SDAG-LABEL: vector_to_vector_cast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI20_0 at PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI20_0 at PAGEOFF]
+; SDAG-NEXT:    add x8, sp, #14
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    str h0, [sp, #14]
+; SDAG-NEXT:    ld1.b { v0 }[0], [x8]
+; SDAG-NEXT:    orr x8, x8, #0x1
+; SDAG-NEXT:    ld1.b { v0 }[4], [x8]
+; SDAG-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: vector_to_vector_cast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    mov d1, v0[1]
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    umov.b w14, v0[2]
+; GISEL-NEXT:    umov.b w15, v0[3]
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    umov.b w17, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w0, v1[1]
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    bfi w13, w10, #1, #31
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w8, v1[0]
+; GISEL-NEXT:    umov.b w10, v1[2]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w13, w13, w14, lsl #2
+; GISEL-NEXT:    umov.b w14, v1[3]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #2
+; GISEL-NEXT:    orr w13, w13, w15, lsl #3
+; GISEL-NEXT:    umov.b w15, v1[4]
+; GISEL-NEXT:    umov.b w11, v0[6]
+; GISEL-NEXT:    bfi w8, w0, #1, #31
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w17, #0x1
+; GISEL-NEXT:    orr w13, w13, w16, lsl #4
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #2
+; GISEL-NEXT:    umov.b w10, v1[5]
+; GISEL-NEXT:    umov.b w16, v1[6]
+; GISEL-NEXT:    orr w13, w13, w17, lsl #5
+; GISEL-NEXT:    umov.b w17, v0[4]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w8, w8, w14, lsl #3
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v1[7]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #3
+; GISEL-NEXT:    orr w11, w13, w11, lsl #6
+; GISEL-NEXT:    orr w8, w8, w15, lsl #4
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w12, w17, #0x1
+; GISEL-NEXT:    umov.b w13, v0[1]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #5
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w12, lsl #4
+; GISEL-NEXT:    umov.b w10, v0[0]
+; GISEL-NEXT:    orr w11, w11, w0, lsl #7
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w16, lsl #6
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    umov.b w12, v0[6]
+; GISEL-NEXT:    strb w11, [sp, #8]
+; GISEL-NEXT:    and w11, w13, #0x1
+; GISEL-NEXT:    umov.b w13, v0[3]
+; GISEL-NEXT:    orr w8, w8, w14, lsl #7
+; GISEL-NEXT:    umov.b w14, v0[7]
+; GISEL-NEXT:    ldr b0, [sp, #8]
+; GISEL-NEXT:    bfi w10, w11, #1, #31
+; GISEL-NEXT:    and w11, w15, #0x1
+; GISEL-NEXT:    strb w8, [sp, #9]
+; GISEL-NEXT:    umov.b w15, v0[4]
+; GISEL-NEXT:    and w8, w12, #0x1
+; GISEL-NEXT:    orr w10, w10, w11, lsl #2
+; GISEL-NEXT:    orr w8, w9, w8, lsl #6
+; GISEL-NEXT:    and w9, w13, #0x1
+; GISEL-NEXT:    umov.b w11, v0[1]
+; GISEL-NEXT:    orr w9, w10, w9, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[0]
+; GISEL-NEXT:    and w13, w14, #0x1
+; GISEL-NEXT:    umov.b w16, v0[2]
+; GISEL-NEXT:    umov.b w17, v0[3]
+; GISEL-NEXT:    and w14, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w13, lsl #7
+; GISEL-NEXT:    orr w9, w9, w14, lsl #4
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v0[3]
+; GISEL-NEXT:    strb w8, [sp, #10]
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    bfi w12, w11, #1, #31
+; GISEL-NEXT:    orr w8, w9, w8, lsl #5
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    and w9, w15, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    orr w9, w12, w9, lsl #2
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w8, w8, w13, lsl #6
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    orr w9, w9, w14, lsl #3
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    umov.b w0, v0[3]
+; GISEL-NEXT:    orr w9, w9, w10, lsl #4
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    umov.b w12, v0[7]
+; GISEL-NEXT:    orr w8, w8, w11, lsl #7
+; GISEL-NEXT:    bfi w13, w15, #1, #31
+; GISEL-NEXT:    and w11, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w10, lsl #5
+; GISEL-NEXT:    and w10, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[4]
+; GISEL-NEXT:    strb w8, [sp, #11]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    umov.b w16, v0[3]
+; GISEL-NEXT:    orr w8, w9, w10, lsl #6
+; GISEL-NEXT:    orr w9, w13, w11, lsl #2
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    and w11, w17, #0x1
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    umov.b w17, v0[0]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #7
+; GISEL-NEXT:    orr w9, w9, w11, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    and w11, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[0]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #4
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    bfi w17, w15, #1, #31
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w12, v0[2]
+; GISEL-NEXT:    bfi w14, w10, #1, #31
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    ldr b1, [sp, #9]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    strb w8, [sp, #12]
+; GISEL-NEXT:    orr w11, w14, w11, lsl #2
+; GISEL-NEXT:    and w14, w16, #0x1
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w13, lsl #6
+; GISEL-NEXT:    orr w11, w11, w14, lsl #3
+; GISEL-NEXT:    orr w12, w17, w12, lsl #2
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[5]
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    orr w10, w11, w10, lsl #4
+; GISEL-NEXT:    orr w12, w12, w17, lsl #3
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    umov.b w17, v0[6]
+; GISEL-NEXT:    orr w10, w10, w15, lsl #5
+; GISEL-NEXT:    umov.b w15, v0[7]
+; GISEL-NEXT:    orr w12, w12, w16, lsl #4
+; GISEL-NEXT:    and w16, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w12, w12, w16, lsl #5
+; GISEL-NEXT:    orr w10, w10, w14, lsl #6
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w17, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #7
+; GISEL-NEXT:    mov.s v0[1], v1[0]
+; GISEL-NEXT:    orr w11, w12, w13, lsl #6
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; GISEL-NEXT:    orr w8, w10, w12, lsl #7
+; GISEL-NEXT:    and w10, w0, #0x1
+; GISEL-NEXT:    strb w9, [sp, #13]
+; GISEL-NEXT:    orr w9, w11, w10, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #14]
+; GISEL-NEXT:    strb w9, [sp, #15]
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
   %bc = bitcast <16 x i1> %arg to <2 x i8>
   ret <2 x i8> %bc
 }