[llvm] 35f2caf - [AArch64][GlobalISel] Select TBL/TBX Intrinsics (#92914)

via llvm-commits llvm-commits at lists.llvm.org
Wed May 29 05:14:08 PDT 2024


Author: chuongg3
Date: 2024-05-29T13:14:03+01:00
New Revision: 35f2caf713489049cc1b31aa3fe0a054968f80e3

URL: https://github.com/llvm/llvm-project/commit/35f2caf713489049cc1b31aa3fe0a054968f80e3
DIFF: https://github.com/llvm/llvm-project/commit/35f2caf713489049cc1b31aa3fe0a054968f80e3.diff

LOG: [AArch64][GlobalISel] Select TBL/TBX Intrinsics (#92914)

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
    llvm/test/CodeGen/AArch64/arm64-tbl.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 3b3c1fc8b27bf..4a7c82b393c10 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -227,6 +227,8 @@ class AArch64InstructionSelector : public InstructionSelector {
   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
+  void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
+                   unsigned Opc1, unsigned Opc2, bool isExt);
 
   bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -6537,6 +6539,25 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
     I.eraseFromParent();
     return true;
   }
+  case Intrinsic::aarch64_neon_tbl2:
+    SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false);
+    return true;
+  case Intrinsic::aarch64_neon_tbl3:
+    SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three,
+                false);
+    return true;
+  case Intrinsic::aarch64_neon_tbl4:
+    SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false);
+    return true;
+  case Intrinsic::aarch64_neon_tbx2:
+    SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true);
+    return true;
+  case Intrinsic::aarch64_neon_tbx3:
+    SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true);
+    return true;
+  case Intrinsic::aarch64_neon_tbx4:
+    SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true);
+    return true;
   case Intrinsic::swift_async_context_addr:
     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
                               {Register(AArch64::FP)})
@@ -6552,6 +6573,30 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
   return false;
 }
 
+void AArch64InstructionSelector::SelectTable(MachineInstr &I,
+                                             MachineRegisterInfo &MRI,
+                                             unsigned NumVec, unsigned Opc1,
+                                             unsigned Opc2, bool isExt) {
+  Register DstReg = I.getOperand(0).getReg(); // result of the intrinsic
+  unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2;
+
+  // Create the REG_SEQUENCE tuple that holds the NumVec table operands.
+  SmallVector<Register, 4> Regs;
+  for (unsigned i = 0; i < NumVec; i++)
+    Regs.push_back(I.getOperand(i + 2 + isExt).getReg());
+  Register RegSeq = createQTuple(Regs, MIB);
+
+  Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg(); // after tables
+  MachineInstrBuilder Instr;
+  if (isExt) {
+    Register Reg = I.getOperand(2).getReg(); // extra operand before the tables
+    Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg});
+  } else
+    Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg});
+  constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI);
+  I.eraseFromParent();
+}
+
 InstructionSelector::ComplexRendererFns
 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
   auto MaybeImmed = getImmedFromMO(Root);

diff  --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index 96b2af7274b5b..44b92e6ccd088 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -1,28 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:         warning: Instruction selection used fallback path for tbl2_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl2_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl3_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl3_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl4_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl4_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_v8i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask2
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask2
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_shuffle
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx2_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx2_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx3_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx3_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx4_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx4_16b
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
 ; CHECK-LABEL: tbl1_8b:
@@ -43,175 +21,378 @@ define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
 }
 
 define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
-; CHECK-LABEL: tbl2_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.8b v0, { v0, v1 }, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl2_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1 }, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl2_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
-; CHECK-LABEL: tbl2_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl2_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl2_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK-LABEL: tbl3_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl3_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl3_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK-LABEL: tbl3_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl3_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl3_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK-LABEL: tbl4_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl4_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl4_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK-LABEL: tbl4_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl4_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl4_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %tmp3
 }
 
-; CHECK-LABEL: .LCPI8_0:
-; CHECK-NEXT:     .byte    0                               // 0x0
-; CHECK-NEXT:     .byte    4                               // 0x4
-; CHECK-NEXT:     .byte    8                               // 0x8
-; CHECK-NEXT:     .byte    12                              // 0xc
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
+; CHECK-SD-LABEL: .LCPI8_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+
+; CHECK-GI-LABEL: .LCPI8_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   13                              // 0xd
+; CHECK-GI-NEXT:         .byte   14                              // 0xe
+; CHECK-GI-NEXT:         .byte   15                              // 0xf
+; CHECK-GI-LABEL: .LCPI8_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    tbl.8b v0, { v0, v1 }, v4
-; CHECK-NEXT:    tbl.8b v1, { v2, v3 }, v4
-; CHECK-NEXT:    mov.s v0[1], v1[1]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1 }, v4
+; CHECK-SD-NEXT:    tbl.8b v1, { v2, v3 }, v4
+; CHECK-SD-NEXT:    mov.s v0[1], v1[1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr d4, [x8, :lo12:.LCPI8_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.8b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI9_0:
-; CHECK-NEXT:     .byte    0                               // 0x0
-; CHECK-NEXT:     .byte    4                               // 0x4
-; CHECK-NEXT:     .byte    8                               // 0x8
-; CHECK-NEXT:     .byte    12                              // 0xc
-; CHECK-NEXT:     .byte    16                              // 0x10
-; CHECK-NEXT:     .byte    20                              // 0x14
-; CHECK-NEXT:     .byte    24                              // 0x18
-; CHECK-NEXT:     .byte    28                              // 0x1c
-; CHECK-NEXT:     .byte   32                              // 0x20
-; CHECK-NEXT:     .byte   36                              // 0x24
-; CHECK-NEXT:     .byte   40                              // 0x28
-; CHECK-NEXT:     .byte   44                              // 0x2c
-; CHECK-NEXT:     .byte   48                              // 0x30
-; CHECK-NEXT:     .byte   52                              // 0x34
-; CHECK-NEXT:     .byte   56                              // 0x38
-; CHECK-NEXT:     .byte   60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI9_0:
+; CHECK-SD-NEXT:     .byte    0                               // 0x0
+; CHECK-SD-NEXT:     .byte    4                               // 0x4
+; CHECK-SD-NEXT:     .byte    8                               // 0x8
+; CHECK-SD-NEXT:     .byte    12                              // 0xc
+; CHECK-SD-NEXT:     .byte    16                              // 0x10
+; CHECK-SD-NEXT:     .byte    20                              // 0x14
+; CHECK-SD-NEXT:     .byte    24                              // 0x18
+; CHECK-SD-NEXT:     .byte    28                              // 0x1c
+; CHECK-SD-NEXT:     .byte   32                              // 0x20
+; CHECK-SD-NEXT:     .byte   36                              // 0x24
+; CHECK-SD-NEXT:     .byte   40                              // 0x28
+; CHECK-SD-NEXT:     .byte   44                              // 0x2c
+; CHECK-SD-NEXT:     .byte   48                              // 0x30
+; CHECK-SD-NEXT:     .byte   52                              // 0x34
+; CHECK-SD-NEXT:     .byte   56                              // 0x38
+; CHECK-SD-NEXT:     .byte   60                              // 0x3c
+
+;CHECK-GI-LABEL: .LCPI9_0:
+;CHECK-GI:              .byte   0                               // 0x0
+;CHECK-GI-NEXT:         .byte   1                               // 0x1
+;CHECK-GI-NEXT:         .byte   2                               // 0x2
+;CHECK-GI-NEXT:         .byte   3                               // 0x3
+;CHECK-GI-NEXT:         .byte   4                               // 0x4
+;CHECK-GI-NEXT:         .byte   5                               // 0x5
+;CHECK-GI-NEXT:         .byte   6                               // 0x6
+;CHECK-GI-NEXT:         .byte   7                               // 0x7
+;CHECK-GI-NEXT:         .byte   16                              // 0x10
+;CHECK-GI-NEXT:         .byte   17                              // 0x11
+;CHECK-GI-NEXT:         .byte   18                              // 0x12
+;CHECK-GI-NEXT:         .byte   19                              // 0x13
+;CHECK-GI-NEXT:         .byte   20                              // 0x14
+;CHECK-GI-NEXT:         .byte   21                              // 0x15
+;CHECK-GI-NEXT:         .byte   22                              // 0x16
+;CHECK-GI-NEXT:         .byte   23                              // 0x17
+;CHECK-GI-LABEL: .LCPI9_1:
+;CHECK-GI:              .byte   0                               // 0x0
+;CHECK-GI-NEXT:         .byte   4                               // 0x4
+;CHECK-GI-NEXT:         .byte   8                               // 0x8
+;CHECK-GI-NEXT:         .byte   12                              // 0xc
+;CHECK-GI-NEXT:         .byte   16                              // 0x10
+;CHECK-GI-NEXT:         .byte   20                              // 0x14
+;CHECK-GI-NEXT:         .byte   24                              // 0x18
+;CHECK-GI-NEXT:         .byte   28                              // 0x1c
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI9_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI9_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI9_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
+; CHECK-GI-LABEL: .LCPI10_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI10_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s4, w0
-; CHECK-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[1], w0
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[2], w0
-; CHECK-NEXT:    mov.b v4[3], w0
-; CHECK-NEXT:    mov.b v4[4], w0
-; CHECK-NEXT:    mov.b v4[5], w0
-; CHECK-NEXT:    mov.b v4[6], w0
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36 // =0x24
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40 // =0x28
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44 // =0x2c
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48 // =0x30
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52 // =0x34
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56 // =0x38
-; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #60 // =0x3c
-; CHECK-NEXT:    mov.b v4[15], w8
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s4, w0
+; CHECK-SD-NEXT:    mov w8, #32 // =0x20
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[1], w0
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[2], w0
+; CHECK-SD-NEXT:    mov.b v4[3], w0
+; CHECK-SD-NEXT:    mov.b v4[4], w0
+; CHECK-SD-NEXT:    mov.b v4[5], w0
+; CHECK-SD-NEXT:    mov.b v4[6], w0
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov w8, #36 // =0x24
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov w8, #40 // =0x28
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov w8, #44 // =0x2c
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov w8, #48 // =0x30
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov w8, #52 // =0x34
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    mov w8, #56 // =0x38
+; CHECK-SD-NEXT:    mov.b v4[14], w8
+; CHECK-SD-NEXT:    mov w8, #60 // =0x3c
+; CHECK-SD-NEXT:    mov.b v4[15], w8
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI10_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI10_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI10_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -234,40 +415,111 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-GI-LABEL: .LCPI11_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   15                              // 0xf
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   31                              // 0x1f
+; CHECK-GI-LABEL: .LCPI11_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[1], w8
-; CHECK-NEXT:    mov.b v4[2], w8
-; CHECK-NEXT:    mov.b v4[3], w8
-; CHECK-NEXT:    mov.b v4[4], w8
-; CHECK-NEXT:    mov.b v4[5], w8
-; CHECK-NEXT:    mov.b v4[6], w8
-; CHECK-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36 // =0x24
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40 // =0x28
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44 // =0x2c
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48 // =0x30
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52 // =0x34
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56 // =0x38
-; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    mov.b v4[15], w8
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    fmov s4, w8
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[1], w8
+; CHECK-SD-NEXT:    mov.b v4[2], w8
+; CHECK-SD-NEXT:    mov.b v4[3], w8
+; CHECK-SD-NEXT:    mov.b v4[4], w8
+; CHECK-SD-NEXT:    mov.b v4[5], w8
+; CHECK-SD-NEXT:    mov.b v4[6], w8
+; CHECK-SD-NEXT:    mov w8, #32 // =0x20
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov w8, #36 // =0x24
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov w8, #40 // =0x28
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov w8, #44 // =0x2c
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov w8, #48 // =0x30
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov w8, #52 // =0x34
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    mov w8, #56 // =0x38
+; CHECK-SD-NEXT:    mov.b v4[14], w8
+; CHECK-SD-NEXT:    mov w8, #31 // =0x1f
+; CHECK-SD-NEXT:    mov.b v4[15], w8
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s6, w0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI11_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
+; CHECK-GI-NEXT:    mov.b v5[15], v6[0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2
@@ -290,29 +542,116 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI12_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+
+; CHECK-GI-LABEL: .LCPI12_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI12_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.2d v4, #0xffffffffffffffff
-; CHECK-NEXT:    adrp x8, .LCPI12_0
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
-; CHECK-NEXT:    mov.b v4[0], w0
-; CHECK-NEXT:    mov.b v4[1], w0
-; CHECK-NEXT:    mov.b v4[2], w0
-; CHECK-NEXT:    mov.b v4[3], w0
-; CHECK-NEXT:    mov.b v4[4], w0
-; CHECK-NEXT:    mov.b v4[5], w0
-; CHECK-NEXT:    mov.b v4[6], w0
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT:    mov.d v2[1], v0[0]
-; CHECK-NEXT:    mov.16b v0, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.2d v4, #0xffffffffffffffff
+; CHECK-SD-NEXT:    adrp x8, .LCPI12_0
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-SD-NEXT:    mov.b v4[0], w0
+; CHECK-SD-NEXT:    mov.b v4[1], w0
+; CHECK-SD-NEXT:    mov.b v4[2], w0
+; CHECK-SD-NEXT:    mov.b v4[3], w0
+; CHECK-SD-NEXT:    mov.b v4[4], w0
+; CHECK-SD-NEXT:    mov.b v4[5], w0
+; CHECK-SD-NEXT:    mov.b v4[6], w0
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-SD-NEXT:    mov.d v2[1], v0[0]
+; CHECK-SD-NEXT:    mov.16b v0, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI12_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -335,29 +674,133 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI13_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-LABEL: .LCPI13_1:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   1                               // 0x1
+; CHECK-SD-NEXT:         .byte   2                               // 0x2
+; CHECK-SD-NEXT:         .byte   3                               // 0x3
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   5                               // 0x5
+; CHECK-SD-NEXT:         .byte   6                               // 0x6
+; CHECK-SD-NEXT:         .byte   7                               // 0x7
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   17                              // 0x11
+; CHECK-SD-NEXT:         .byte   18                              // 0x12
+; CHECK-SD-NEXT:         .byte   19                              // 0x13
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   21                              // 0x15
+; CHECK-SD-NEXT:         .byte   30                              // 0x1e
+; CHECK-SD-NEXT:         .byte   31                              // 0x1f
+
+; CHECK-GI-LABEL: .LCPI13_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   30                              // 0x1e
+; CHECK-GI-NEXT:         .byte   31                              // 0x1f
+; CHECK-GI-LABEL: .LCPI13_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup.16b v4, w0
-; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    adrp x8, .LCPI13_0
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT:    adrp x8, .LCPI13_1
-; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
-; CHECK-NEXT:    tbl.16b v3, { v0, v1 }, v4
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI13_1]
-; CHECK-NEXT:    tbl.16b v0, { v2, v3 }, v0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup.16b v4, w0
+; CHECK-SD-NEXT:    mov w8, #255 // =0xff
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    adrp x8, .LCPI13_0
+; CHECK-SD-NEXT:    ldr q5, [x8, :lo12:.LCPI13_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI13_1
+; CHECK-SD-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-SD-NEXT:    tbl.16b v3, { v0, v1 }, v4
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI13_1]
+; CHECK-SD-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    fmov s6, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[8], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI13_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -380,106 +823,293 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI14_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
 
-; CHECK-LABEL: .LCPI14_0:
-; CHECK-NEXT:	.byte	0                               // 0x0
-; CHECK-NEXT:	.byte	4                               // 0x4
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	12                              // 0xc
-; CHECK-NEXT:	.byte	16                              // 0x10
-; CHECK-NEXT:	.byte	20                              // 0x14
-; CHECK-NEXT:	.byte	24                              // 0x18
-; CHECK-NEXT:	.byte	28                              // 0x1c
-; CHECK-NEXT:	.byte	32                              // 0x20
-; CHECK-NEXT:	.byte	36                              // 0x24
-; CHECK-NEXT:	.byte	40                              // 0x28
-; CHECK-NEXT:	.byte	44                              // 0x2c
-; CHECK-NEXT:	.byte	48                              // 0x30
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	56                              // 0x38
-; CHECK-NEXT:	.byte	60                              // 0x3c
+; CHECK-GI-LABEL: .LCPI14_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI14_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI14_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI14_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI14_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI15_0:
-; CHECK-NEXT:	.byte	0                               // 0x0
-; CHECK-NEXT:	.byte	4                               // 0x4
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	12                              // 0xc
-; CHECK-NEXT:	.byte	16                              // 0x10
-; CHECK-NEXT:	.byte	20                              // 0x14
-; CHECK-NEXT:	.byte	24                              // 0x18
-; CHECK-NEXT:	.byte	28                              // 0x1c
-; CHECK-NEXT:	.byte	32                              // 0x20
-; CHECK-NEXT:	.byte	36                              // 0x24
-; CHECK-NEXT:	.byte	40                              // 0x28
-; CHECK-NEXT:	.byte	44                              // 0x2c
-; CHECK-NEXT:	.byte	48                              // 0x30
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	56                              // 0x38
-; CHECK-NEXT:	.byte	60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI15_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
+
+; CHECK-GI-LABEL: .LCPI15_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI15_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-LABEL: .LCPI15_2:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI15_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI15_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_2
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI15_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI15_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI16_0:
-; CHECK-NEXT: 	.byte	0                               // 0x0
-; CHECK-NEXT: 	.byte	4                               // 0x4
-; CHECK-NEXT: 	.byte	52                              // 0x34
-; CHECK-NEXT: 	.byte	12                              // 0xc
-; CHECK-NEXT: 	.byte	16                              // 0x10
-; CHECK-NEXT: 	.byte	20                              // 0x14
-; CHECK-NEXT: 	.byte	24                              // 0x18
-; CHECK-NEXT: 	.byte	28                              // 0x1c
-; CHECK-NEXT: 	.byte	32                              // 0x20
-; CHECK-NEXT: 	.byte	36                              // 0x24
-; CHECK-NEXT: 	.byte	40                              // 0x28
-; CHECK-NEXT: 	.byte	44                              // 0x2c
-; CHECK-NEXT: 	.byte	48                              // 0x30
-; CHECK-NEXT: 	.byte	52                              // 0x34
-; CHECK-NEXT: 	.byte	56                              // 0x38
-; CHECK-NEXT: 	.byte	60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI16_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
+
+; CHECK-GI-LABEL: .LCPI16_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI16_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-LABEL: .LCPI16_2:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_2
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI16_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI16_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -514,73 +1144,121 @@ define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
 }
 
 define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK-LABEL: tbx2_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    tbx.8b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx2_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx2_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK-LABEL: tbx2_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    tbx.16b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx2_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx2_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK-LABEL: tbx3_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx3_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx3_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK-LABEL: tbx3_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx3_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx3_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
-; CHECK-LABEL: tbx4_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx4_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx4_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
-; CHECK-LABEL: tbx4_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx4_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx4_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
   ret <16 x i8> %tmp3
 }
@@ -594,6 +1272,3 @@ declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>,
 declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
 declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}


        


More information about the llvm-commits mailing list