[llvm] fbc2473 - [AArch64][GlobalISel] Legalization for small anyext/sext/zext (#86438)

via llvm-commits <llvm-commits at lists.llvm.org>
Tue Mar 26 02:48:11 PDT 2024


Author: David Green
Date: 2024-03-26T09:48:06Z
New Revision: fbc247367a86e8a711584801586ec9616268d889

URL: https://github.com/llvm/llvm-project/commit/fbc247367a86e8a711584801586ec9616268d889
DIFF: https://github.com/llvm/llvm-project/commit/fbc247367a86e8a711584801586ec9616268d889.diff

LOG: [AArch64][GlobalISel] Legalization for small anyext/sext/zext (#86438)

Similar to #85625, some of the resulting codegen is still far from optimal,
but this fixes quite a few GlobalISel fallback cases.
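
For context, a minimal sketch (not taken from the patch itself) of the kind of
IR this targets: extensions from small, sub-64-bit vectors such as <4 x i8>,
which previously made GlobalISel emit the "Instruction selection used fallback
path" warning on AArch64 (see the fallback warnings removed from the tests
below) and are now legalized by widening the source vector first.

; Hypothetical illustrative example, not a test from this patch; compare the
; smull_zext_v4i16_v4i32 and load_v2i16 updates below. Built with
; llc -mtriple=aarch64 -global-isel.
define <4 x i32> @zext_v4i8_v4i32(<4 x i8> %a) {
  %z = zext <4 x i8> %a to <4 x i32>
  ret <4 x i32> %z
}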

Added: 
    

Modified: 
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
    llvm/test/CodeGen/AArch64/aarch64-smull.ll
    llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
    llvm/test/CodeGen/AArch64/bitcast.ll
    llvm/test/CodeGen/AArch64/bswap.ll
    llvm/test/CodeGen/AArch64/fptoi.ll
    llvm/test/CodeGen/AArch64/itofp.ll
    llvm/test/CodeGen/AArch64/load.ll
    llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
    llvm/test/CodeGen/AArch64/shift.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 1b25da8833e4fb..c3a23ea0ad3738 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5421,14 +5421,22 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case TargetOpcode::G_FPTOUI:
   case TargetOpcode::G_SITOFP:
   case TargetOpcode::G_UITOFP: {
-    if (TypeIdx != 0)
-      return UnableToLegalize;
     Observer.changingInstr(MI);
-    LLT SrcTy = LLT::fixed_vector(
-        MoreTy.getNumElements(),
-        MRI.getType(MI.getOperand(1).getReg()).getElementType());
-    moreElementsVectorSrc(MI, SrcTy, 1);
-    moreElementsVectorDst(MI, MoreTy, 0);
+    LLT SrcExtTy;
+    LLT DstExtTy;
+    if (TypeIdx == 0) {
+      DstExtTy = MoreTy;
+      SrcExtTy = LLT::fixed_vector(
+          MoreTy.getNumElements(),
+          MRI.getType(MI.getOperand(1).getReg()).getElementType());
+    } else {
+      DstExtTy = LLT::fixed_vector(
+          MoreTy.getNumElements(),
+          MRI.getType(MI.getOperand(0).getReg()).getElementType());
+      SrcExtTy = MoreTy;
+    }
+    moreElementsVectorSrc(MI, SrcExtTy, 1);
+    moreElementsVectorDst(MI, DstExtTy, 0);
     Observer.changedInstr(MI);
     return Legalized;
   }

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 34e2c1d9c8e2f9..33dba6a5c61eaf 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -611,7 +611,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                Query.Types[0].isVector() &&
                (Query.Types[1].getScalarSizeInBits() == 8 ||
                 Query.Types[1].getScalarSizeInBits() == 16);
-      });
+      })
+      .clampMinNumElements(1, s8, 8)
+      .clampMinNumElements(1, s16, 4);
 
   getActionDefinitionsBuilder(G_TRUNC)
       .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
@@ -630,7 +632,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       })
       .clampMinNumElements(0, s8, 8)
       .clampMinNumElements(0, s16, 4)
-      .clampMinNumElements(0, s32, 2)
       .alwaysLegal();
 
   getActionDefinitionsBuilder(G_SEXT_INREG)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir
index c9556e27c6349a..a63d8b9c137725 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir
@@ -121,10 +121,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $h1
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
-    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY1]](s16)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32)
-    ; CHECK-NEXT: $d0 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[DEF]](s16), [[DEF]](s16)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[BUILD_VECTOR]](<4 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[ANYEXT]](<4 x s32>)
+    ; CHECK-NEXT: $d0 = COPY [[UV]](<2 x s32>)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(s16) = COPY $h0
     %1:_(s16) = COPY $h1
@@ -141,8 +142,8 @@ body: |
     ; CHECK-LABEL: name: widen_v2s8
     ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
-    ; CHECK-NEXT: %3:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[DEF]](s32)
-    ; CHECK-NEXT: $d0 = COPY %3(<2 x s32>)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: $d0 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(s8) = G_IMPLICIT_DEF
     %1:_(s8) = G_IMPLICIT_DEF
@@ -157,12 +158,14 @@ name:            widen_v4s8
 body: |
   bb.0:
     ; CHECK-LABEL: name: widen_v4s8
-    ; CHECK: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[DEF]](s16)
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY [[DEF]](s16)
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[DEF]](s16)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[COPY2]](s16), [[DEF]](s16)
-    ; CHECK-NEXT: $d0 = COPY [[BUILD_VECTOR]](<4 x s16>)
+    ; CHECK: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[DEF]](s8), [[DEF1]](s8), [[DEF2]](s8), [[DEF3]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
+    ; CHECK-NEXT: $d0 = COPY [[UV]](<4 x s16>)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(s8) = G_IMPLICIT_DEF
     %1:_(s8) = G_IMPLICIT_DEF

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
index e12353c7ef5bec..d3db2432e84cbc 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
@@ -235,31 +235,32 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+  ; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[COPY3]](s32)
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
-  ; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[DEF2]](s16)
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
-  ; CHECK-NEXT:   [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[C2]](s16), [[C1]](s64)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>)
-  ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16)
-  ; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16)
-  ; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
+  ; CHECK-NEXT:   [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[UV]], [[C2]](s16), [[C1]](s64)
+  ; CHECK-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>)
+  ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
+  ; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
+  ; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>)
   ; CHECK-NEXT:   [[UITOFP:%[0-9]+]]:_(<4 x s32>) = G_UITOFP [[BITCAST]](<4 x s32>)
-  ; CHECK-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>)
-  ; CHECK-NEXT:   G_STORE [[UV4]](s32), [[COPY]](p0) :: (store (s32), align 16)
+  ; CHECK-NEXT:   [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>)
+  ; CHECK-NEXT:   G_STORE [[UV6]](s32), [[COPY]](p0) :: (store (s32), align 16)
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-  ; CHECK-NEXT:   G_STORE [[UV5]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4)
+  ; CHECK-NEXT:   G_STORE [[UV7]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4)
   ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-  ; CHECK-NEXT:   G_STORE [[UV6]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 8, align 8)
+  ; CHECK-NEXT:   G_STORE [[UV8]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 8, align 8)
   ; CHECK-NEXT:   G_BR %bb.1
   bb.1:
     liveins: $w1, $w2, $w3, $x0

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
index 63a26dcfea4762..e49a94c12ed468 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
@@ -293,41 +293,44 @@ body:             |
     ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %w0(s32), [[C]]
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ICMP2]], 1
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16)
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16)
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[COPY2]](s16), [[DEF1]](s16)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
-    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[TRUNC]](s16), [[C1]](s64)
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>)
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16)
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16)
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
-    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
-    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[UV]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
+    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV5]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
     ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<8 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 0, 0, 0, undef, undef, undef, undef)
-    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s8>), [[UV5:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[SHUF]](<8 x s8>)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[C2]](s16)
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s16) = COPY [[C2]](s16)
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s16) = COPY [[C2]](s16)
-    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY3]](s16), [[COPY4]](s16), [[COPY5]](s16), [[C2]](s16)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[UV4]](<4 x s8>)
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[ANYEXT]], [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s8>), [[UV7:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[SHUF]](<8 x s8>)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV6]](<4 x s8>)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[UV8]](s8), [[UV9]](s8), [[UV10]](s8), [[UV11]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR3]](<8 x s8>)
+    ; CHECK-NEXT: [[UV12:%[0-9]+]]:_(<4 x s16>), [[UV13:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR4]](<8 x s8>)
+    ; CHECK-NEXT: [[UV14:%[0-9]+]]:_(<4 x s16>), [[UV15:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>)
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[UV12]], [[UV14]]
     ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
-    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[UV4]](<4 x s8>)
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC5]], [[ANYEXT1]]
+    ; CHECK-NEXT: [[UV16:%[0-9]+]]:_(s8), [[UV17:%[0-9]+]]:_(s8), [[UV18:%[0-9]+]]:_(s8), [[UV19:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV6]](<4 x s8>)
+    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[UV16]](s8), [[UV17]](s8), [[UV18]](s8), [[UV19]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR5]](<8 x s8>)
+    ; CHECK-NEXT: [[UV20:%[0-9]+]]:_(<4 x s16>), [[UV21:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT3]](<8 x s16>)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC5]], [[UV20]]
     ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
     ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC6]], [[XOR]]
     ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]]
-    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[OR]](<4 x s16>)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[OR]](<4 x s16>)
     ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C3]](s32), [[C3]](s32), [[C3]](s32), [[C3]](s32)
-    ; CHECK-NEXT: %zext_select:_(<4 x s32>) = G_AND [[ANYEXT2]], [[BUILD_VECTOR4]]
+    ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C3]](s32), [[C3]](s32), [[C3]](s32), [[C3]](s32)
+    ; CHECK-NEXT: %zext_select:_(<4 x s32>) = G_AND [[ANYEXT4]], [[BUILD_VECTOR6]]
     ; CHECK-NEXT: $q0 = COPY %zext_select(<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %w0:_(s32) = COPY $w0

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
index 42a8f51002f200..f7efaeaa507053 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
@@ -1,16 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel-abort=2 -global-isel -o - %s | FileCheck %s
+; RUN: llc -global-isel -o - %s | FileCheck %s
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
 
 declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>) #0
 
-; This test currently falls back but ensures we don't crash.
-
 define i32 @bar() {
 ; CHECK-LABEL: bar:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-NEXT:    mov b1, v0[1]
+; CHECK-NEXT:    mov b2, v0[2]
+; CHECK-NEXT:    mov b3, v0[3]
+; CHECK-NEXT:    mov.h v0[1], v1[0]
+; CHECK-NEXT:    mov.h v2[1], v3[0]
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll.4s v1, v2, #0
+; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    movi.4s v1, #1
+; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
index ed40a2ff7ea70f..e729f027baa715 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
@@ -541,17 +541,13 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
     ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32)
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR1]](<4 x s32>)
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
-    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s8>), [[UV3:%[0-9]+]]:_(<2 x s8>), [[UV4:%[0-9]+]]:_(<2 x s8>), [[UV5:%[0-9]+]]:_(<2 x s8>) = G_UNMERGE_VALUES [[TRUNC2]](<8 x s8>)
-    ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[UV2]](<2 x s8>), [[UV2]](<2 x s8>)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS1]](<4 x s8>)
-    ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC]](s8), [[TRUNC1]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s16>), [[UV3:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
+    ; CHECK-NEXT: $d0 = COPY [[UV2]](<4 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %0:_(<2 x s32>) = COPY $d0
     %1:_(<2 x s8>) = G_TRUNC %0(<2 x s32>)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index dbc5417e23133d..61a4f64ac2bfcb 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -3,8 +3,7 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for smull_zext_v4i16_v4i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
+; CHECK-GI:       warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for smlsl2_v8i16_uzp1
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for umlsl2_v8i16_uzp1
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for smlsl2_v4i32_uzp1
@@ -189,13 +188,49 @@ define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v4i16_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_zext_v4i16_v4i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr s0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_zext_v4i16_v4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr s0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_zext_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    uxtb w8, w8
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w10, s2
+; CHECK-GI-NEXT:    fmov w11, s3
+; CHECK-GI-NEXT:    uxtb w9, w9
+; CHECK-GI-NEXT:    uxtb w10, w10
+; CHECK-GI-NEXT:    uxtb w11, w11
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    fmov s3, w11
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ldr d2, [x1]
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %load.A = load <4 x i8>, ptr %A
   %load.B = load <4 x i16>, ptr %B
   %zext.A = zext <4 x i8> %load.A to <4 x i32>

diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
index a1e06936852726..bc399c8d4ff071 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast -aarch64-enable-sink-fold=true | FileCheck %s --check-prefix=CHECK-SDAG
-; RUN: llc < %s -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast -aarch64-enable-sink-fold=true | FileCheck %s --check-prefix=CHECK-GISEL
+; RUN: llc < %s -global-isel -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast -aarch64-enable-sink-fold=true | FileCheck %s --check-prefix=CHECK-GISEL
 
 define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
 ; CHECK-SDAG-LABEL: test_varidx_extract_v8s8:
@@ -29,20 +29,20 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
 ; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GISEL-NEXT:    mov w9, w0
 ; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GISEL-NEXT:    mov b1, v0.b[1]
 ; CHECK-GISEL-NEXT:    add x8, sp, #8
-; CHECK-GISEL-NEXT:    str d0, [sp, #8]
 ; CHECK-GISEL-NEXT:    and x9, x9, #0x7
-; CHECK-GISEL-NEXT:    mov b2, v0.b[1]
+; CHECK-GISEL-NEXT:    str d0, [sp, #8]
 ; CHECK-GISEL-NEXT:    mov b3, v0.b[2]
 ; CHECK-GISEL-NEXT:    lsl x10, x9, #1
 ; CHECK-GISEL-NEXT:    mov b0, v0.b[3]
 ; CHECK-GISEL-NEXT:    sub x9, x10, x9
-; CHECK-GISEL-NEXT:    ldrb w8, [x8, x9]
-; CHECK-GISEL-NEXT:    fmov s1, w8
-; CHECK-GISEL-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GISEL-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GISEL-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GISEL-NEXT:    fmov d0, d1
+; CHECK-GISEL-NEXT:    ldr b2, [x8, x9]
+; CHECK-GISEL-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GISEL-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GISEL-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GISEL-NEXT:    ushll v0.8h, v2.8b, #0
+; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    add sp, sp, #16
 ; CHECK-GISEL-NEXT:    ret
   %tmp = extractelement <8 x i8> %x, i32 %idx
@@ -176,17 +176,15 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) {
 ; CHECK-GISEL:       // %bb.0:
 ; CHECK-GISEL-NEXT:    sub sp, sp, #16
 ; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT:    mov w9, w0
-; CHECK-GISEL-NEXT:    mov w8, #2 // =0x2
-; CHECK-GISEL-NEXT:    add x10, sp, #8
-; CHECK-GISEL-NEXT:    and x9, x9, #0x3
 ; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GISEL-NEXT:    mov w9, w0
+; CHECK-GISEL-NEXT:    mov h1, v0.h[1]
+; CHECK-GISEL-NEXT:    add x8, sp, #8
 ; CHECK-GISEL-NEXT:    str d0, [sp, #8]
-; CHECK-GISEL-NEXT:    madd x8, x9, x8, x10
-; CHECK-GISEL-NEXT:    umov w9, v0.h[1]
-; CHECK-GISEL-NEXT:    fmov s1, w9
-; CHECK-GISEL-NEXT:    ldr h0, [x8]
-; CHECK-GISEL-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GISEL-NEXT:    and x9, x9, #0x3
+; CHECK-GISEL-NEXT:    ldr h0, [x8, x9, lsl #1]
+; CHECK-GISEL-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GISEL-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    add sp, sp, #16
 ; CHECK-GISEL-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index b7c02d6855a9e9..e0851fd8739ec9 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -1,14 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined.
 
-; CHECK-GI:       warning: Instruction selection used fallback path for bitcast_i32_v4i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_i32_v2i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_v2i16_v4i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_v4i8_v2i16
-
 define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-SD-LABEL: foo1:
 ; CHECK-SD:       // %bb.0:
@@ -80,12 +75,26 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){
 }
 
 define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){
-; CHECK-LABEL: bitcast_i32_v4i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, w1
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    zip1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_i32_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add w8, w0, w1
+; CHECK-SD-NEXT:    fmov s0, w8
+; CHECK-SD-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_i32_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, w1
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %c = add i32 %a, %b
   %d = bitcast i32 %c to <4 x i8>
   ret <4 x i8> %d
@@ -119,13 +128,23 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){
 }
 
 define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){
-; CHECK-LABEL: bitcast_i32_v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, w1
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_i32_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add w8, w0, w1
+; CHECK-SD-NEXT:    fmov s0, w8
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_i32_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, w1
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %c = add i32 %a, %b
   %d = bitcast i32 %c to <2 x i16>
   ret <2 x i16> %d
@@ -382,40 +401,72 @@ define <8 x i16> @bitcast_v16i8_v8i16(<16 x i8> %a, <16 x i8> %b){
 ; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====
 
 define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){
-; CHECK-LABEL: bitcast_v2i16_v4i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    ldr s0, [sp, #12]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_v2i16_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [sp, #12]
+; CHECK-SD-NEXT:    strh w8, [sp, #14]
+; CHECK-SD-NEXT:    ldr s0, [sp, #12]
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_v2i16_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %c = add <2 x i16> %a, %b
   %d = bitcast <2 x i16> %c to <4 x i8>
   ret <4 x i8> %d
 }
 
 define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){
-; CHECK-LABEL: bitcast_v4i8_v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    add x8, sp, #12
-; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    str s0, [sp, #12]
-; CHECK-NEXT:    ld1 { v0.h }[0], [x8]
-; CHECK-NEXT:    orr x8, x8, #0x2
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_v4i8_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    add x8, sp, #12
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    str s0, [sp, #12]
+; CHECK-SD-NEXT:    ld1 { v0.h }[0], [x8]
+; CHECK-SD-NEXT:    orr x8, x8, #0x2
+; CHECK-SD-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_v4i8_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %c = add <4 x i8> %a, %b
   %d = bitcast <4 x i8> %c to <2 x i16>
   ret <2 x i16> %d

diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index f4221accfcbc52..071613b9cc011e 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -1,8 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:         warning: Instruction selection used fallback path for bswap_v2i16
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; ====== Scalar Tests =====
 define i16 @bswap_i16(i16 %a){
@@ -103,11 +101,23 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
 ; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====
 
 define <2 x i16> @bswap_v2i16(<2 x i16> %a){
-; CHECK-LABEL: bswap_v2i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.8b, v0.8b
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bswap_v2i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-SD-NEXT:    ushr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_v2i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
   ret <2 x i16> %res

diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 7af01b53dae7e8..01585d02adcb13 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1,13 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
-
-; CHECK-GI-FP16:       warning: Instruction selection used fallback path for fptos_v2f16_v2i16
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for fptou_v2f16_v2i16
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for fptos_v2f16_v2i8
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for fptou_v2f16_v2i8
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
 define i64 @fptos_f64_i64(double %a) {
 ; CHECK-LABEL: fptos_f64_i64:
@@ -5184,8 +5179,13 @@ define <2 x i16> @fptos_v2f16_v2i16(<2 x half> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: fptos_v2f16_v2i16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-FP16-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    fcvtzs v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5212,8 +5212,13 @@ define <2 x i16> @fptou_v2f16_v2i16(<2 x half> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: fptou_v2f16_v2i16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-FP16-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    fcvtzu v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5658,8 +5663,13 @@ define <2 x i8> @fptos_v2f16_v2i8(<2 x half> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: fptos_v2f16_v2i8:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-FP16-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    fcvtzs v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5686,8 +5696,13 @@ define <2 x i8> @fptou_v2f16_v2i8(<2 x half> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: fptou_v2f16_v2i8:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-FP16-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    fcvtzu v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index ceac37f91238a8..f5a7b5dc9f4922 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4,13 +4,6 @@
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
-; CHECK-GI:       warning: Instruction selection used fallback path for stofp_v3i8_v3f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i8_v3f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i8_v3f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i8_v3f32
-; CHECK-GI-NOFP16-NEXT:  warning: Instruction selection used fallback path for stofp_v3i8_v3f16
-; CHECK-GI-NOFP16-NEXT:  warning: Instruction selection used fallback path for utofp_v3i8_v3f16
-
 define double @stofp_i64_f64(i64 %a) {
 ; CHECK-LABEL: stofp_i64_f64:
 ; CHECK:       // %bb.0: // %entry
@@ -1754,47 +1747,109 @@ entry:
 }
 
 define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) {
-; CHECK-LABEL: stofp_v3i8_v3f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    fmov s1, w2
-; CHECK-NEXT:    mov v0.s[1], w1
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
-; CHECK-NEXT:    scvtf v2.2d, v1.2d
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i8_v3f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    fmov s1, w2
+; CHECK-SD-NEXT:    mov v0.s[1], w1
+; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #24
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    scvtf v2.2d, v1.2d
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i8_v3f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w2
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    smov x8, v0.s[0]
+; CHECK-GI-NEXT:    smov x9, v0.s[1]
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    smov x8, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.d[1], x9
+; CHECK-GI-NEXT:    smov x9, v0.s[1]
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    scvtf v0.2d, v1.2d
+; CHECK-GI-NEXT:    mov v2.d[1], x9
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i8> %a to <3 x double>
   ret <3 x double> %c
 }
 
 define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) {
-; CHECK-LABEL: utofp_v3i8_v3f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    fmov s2, w2
-; CHECK-NEXT:    mov v0.s[1], w1
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v2.2d, v1.2d
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i8_v3f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    fmov s2, w2
+; CHECK-SD-NEXT:    mov v0.s[1], w1
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    and v1.8b, v2.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-SD-NEXT:    ucvtf v2.2d, v1.2d
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i8_v3f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w2
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov w8, v0.s[0]
+; CHECK-GI-NEXT:    mov w9, v0.s[1]
+; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    mov w8, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.d[1], x9
+; CHECK-GI-NEXT:    mov w9, v0.s[1]
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    ucvtf v0.2d, v1.2d
+; CHECK-GI-NEXT:    mov v2.d[1], x9
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i8> %a to <3 x double>
   ret <3 x double> %c
@@ -3372,31 +3427,71 @@ entry:
 }
 
 define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) {
-; CHECK-LABEL: stofp_v3i8_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov v0.h[1], w1
-; CHECK-NEXT:    mov v0.h[2], w2
-; CHECK-NEXT:    shl v0.4h, v0.4h, #8
-; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    scvtf v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i8_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.h[1], w1
+; CHECK-SD-NEXT:    mov v0.h[2], w2
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i8_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w2
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i8> %a to <3 x float>
   ret <3 x float> %c
 }
 
 define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) {
-; CHECK-LABEL: utofp_v3i8_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov v0.h[1], w1
-; CHECK-NEXT:    mov v0.h[2], w2
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ucvtf v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i8_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.h[1], w1
+; CHECK-SD-NEXT:    mov v0.h[2], w2
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i8_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w2
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i8> %a to <3 x float>
   ret <3 x float> %c
@@ -5582,12 +5677,18 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT:    fmov w9, s0
-; CHECK-GI-FP16-NEXT:    and w9, w9, #0xff
-; CHECK-GI-FP16-NEXT:    and w8, w8, #0xff
-; CHECK-GI-FP16-NEXT:    ucvtf h0, w9
-; CHECK-GI-FP16-NEXT:    ucvtf h1, w8
+; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-FP16-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
@@ -5622,11 +5723,20 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-NOFP16-LABEL: stofp_v3i8_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fmov s0, w0
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w1
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NOFP16-NEXT:    fmov s1, w1
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmov s1, w2
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NOFP16-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NOFP16-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5635,11 +5745,10 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    fmov s0, w0
 ; CHECK-GI-FP16-NEXT:    fmov s1, w1
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.b[1], v1.b[0]
 ; CHECK-GI-FP16-NEXT:    fmov s1, w2
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
-; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-FP16-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-FP16-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5671,10 +5780,20 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-NOFP16-LABEL: utofp_v3i8_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fmov s0, w0
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w1
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w2
-; CHECK-GI-NOFP16-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-GI-NOFP16-NEXT:    fmov s1, w1
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmov s1, w2
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NOFP16-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NOFP16-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5683,11 +5802,10 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    fmov s0, w0
 ; CHECK-GI-FP16-NEXT:    fmov s1, w1
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.b[1], v1.b[0]
 ; CHECK-GI-FP16-NEXT:    fmov s1, w2
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT:    movi d1, #0xff00ff00ff00ff
-; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-FP16-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-FP16-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 39143e5c53ffd1..c3c0ec5e3d9d8d 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -159,7 +159,8 @@ define <2 x i16> @load_v2i16(ptr %ptr){
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
 ; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <2 x i16>, ptr %ptr

diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 01620652301ed4..57f220f621cf8b 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1117,8 +1117,15 @@ define <4 x i16> @vselect_constant_cond_zero_v4i16(<4 x i16> %a) {
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_zero_v4i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI84_0
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI84_0]
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov w9, #0 // =0x0
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v3.16b, v1.16b
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ushll v1.8h, v3.8b, #0
 ; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #15
 ; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #15
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
@@ -1137,8 +1144,16 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_zero_v4i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI85_0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI85_0]
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov w9, #0 // =0x0
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v3.16b, v1.16b
+; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v1.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v1.d[1], v2.d[0]
 ; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
 ; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
@@ -1181,8 +1196,15 @@ define <4 x i16> @vselect_constant_cond_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_v4i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI87_0
-; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI87_0]
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov w9, #0 // =0x0
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v4.16b, v2.16b
+; CHECK-GI-NEXT:    mov v4.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov v4.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v4.b[3], v2.b[0]
+; CHECK-GI-NEXT:    ushll v2.8h, v4.8b, #0
 ; CHECK-GI-NEXT:    shl v2.4h, v2.4h, #15
 ; CHECK-GI-NEXT:    sshr v2.4h, v2.4h, #15
 ; CHECK-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
@@ -1201,8 +1223,16 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_v4i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI88_0
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI88_0]
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov w9, #0 // =0x0
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v4.16b, v2.16b
+; CHECK-GI-NEXT:    mov v4.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
+; CHECK-GI-NEXT:    ushll v2.4s, v4.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
 ; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #31
 ; CHECK-GI-NEXT:    sshr v2.4s, v2.4s, #31
 ; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v2.16b

diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 5287839ee7b705..9c8d3e0f07de87 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -1,13 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:         warning: Instruction selection used fallback path for shl_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shl_v2i16
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for ashr_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for ashr_v2i16
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for lshr_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for lshr_v2i16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i1 @shl_i1(i1 %0, i1 %1){
 ; CHECK-SD-LABEL: shl_i1:
@@ -530,11 +523,38 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %0, <2 x i64> %1){
 ; ===== Vector Larger/Smaller than Legal =====
 
 define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){
-; CHECK-LABEL: shl_v4i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    mov h4, v0.h[2]
+; CHECK-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NEXT:    mov h6, v1.h[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov h2, v1.h[2]
+; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = shl <4 x i8> %0, %1
     ret <4 x i8> %3
 }
@@ -556,12 +576,27 @@ define <32 x i8> @shl_v32i8(<32 x i8> %0, <32 x i8> %1){
 }
 
 define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){
-; CHECK-LABEL: shl_v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = shl <2 x i16> %0, %1
     ret <2 x i16> %3
 }
@@ -633,14 +668,42 @@ define <4 x i64> @shl_v4i64(<4 x i64> %0, <4 x i64> %1){
 }
 
 define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){
-; CHECK-LABEL: ashr_v4i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.4h, v0.4h, #8
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
-; CHECK-NEXT:    neg v1.4h, v1.4h
-; CHECK-NEXT:    sshl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ashr_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    neg v1.4h, v1.4h
+; CHECK-SD-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ashr_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h3, v0.h[1]
+; CHECK-GI-NEXT:    mov h4, v1.h[2]
+; CHECK-GI-NEXT:    mov h5, v1.h[3]
+; CHECK-GI-NEXT:    mov h6, v0.h[3]
+; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov v1.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v1.b[3], v5.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-NEXT:    neg v1.8b, v1.8b
+; CHECK-GI-NEXT:    sshl v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = ashr <4 x i8> %0, %1
     ret <4 x i8> %3
 }
@@ -658,15 +721,31 @@ define <32 x i8> @ashr_v32i8(<32 x i8> %0, <32 x i8> %1){
 }
 
 define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){
-; CHECK-LABEL: ashr_v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    neg v1.2s, v1.2s
-; CHECK-NEXT:    sshl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ashr_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    neg v1.2s, v1.2s
+; CHECK-SD-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ashr_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s3, v0.s[1]
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = ashr <2 x i16> %0, %1
     ret <2 x i16> %3
 }
@@ -727,13 +806,41 @@ define <4 x i64> @ashr_v4i64(<4 x i64> %0, <4 x i64> %1){
 }
 
 define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){
-; CHECK-LABEL: lshr_v4i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    neg v1.4h, v1.4h
-; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: lshr_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    neg v1.4h, v1.4h
+; CHECK-SD-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: lshr_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h3, v0.h[1]
+; CHECK-GI-NEXT:    mov h4, v1.h[2]
+; CHECK-GI-NEXT:    mov h5, v1.h[3]
+; CHECK-GI-NEXT:    mov h6, v0.h[3]
+; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov v1.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v1.b[3], v5.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-NEXT:    neg v1.8b, v1.8b
+; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = lshr <4 x i8> %0, %1
     ret <4 x i8> %3
 }
@@ -751,14 +858,30 @@ define <32 x i8> @lshr_v32i8(<32 x i8> %0, <32 x i8> %1){
 }
 
 define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){
-; CHECK-LABEL: lshr_v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    neg v1.2s, v1.2s
-; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: lshr_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    neg v1.2s, v1.2s
+; CHECK-SD-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: lshr_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s3, v0.s[1]
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = lshr <2 x i16> %0, %1
     ret <2 x i16> %3
 }
