[llvm] [AArch64] Fold tbl1 intrinsic into dup when broadcasting (PR #170872)

Valeriy Savchenko via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 5 07:54:44 PST 2025


https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/170872
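
For context on the pattern this PR folds: a tbl1 whose index vector is a
compile-time constant that replays one aligned 2-, 4-, or 8-byte group across
the whole register is just a lane broadcast. Source written with arm_neon.h
intrinsics along the following lines (an illustrative sketch with names of my
own choosing, not code from the PR) typically lowers to the
llvm.aarch64.neon.tbl1.v16i8 calls exercised by the tests below:

#include <arm_neon.h>

// Broadcast 32-bit lane 2 of `v` to all four lanes via a table lookup.
// The index bytes 8..11 select the third i32 element; with the fold in
// this PR the backend can select `dup v0.4s, v0.s[2]` instead of loading
// the index constant from a constant pool and issuing `tbl`.
uint32x4_t broadcast_lane2(uint32x4_t v) {
  static const uint8_t IdxBytes[16] = {8, 9, 10, 11, 8, 9, 10, 11,
                                       8, 9, 10, 11, 8, 9, 10, 11};
  uint8x16_t Idx = vld1q_u8(IdxBytes);
  uint8x16_t Bytes = vreinterpretq_u8_u32(v);
  return vreinterpretq_u32_u8(vqtbl1q_u8(Bytes, Idx));
}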

From 2a4c0d009b2ef4fa9103d4161fdb7f62ea6ade14 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Fri, 5 Dec 2025 14:31:38 +0000
Subject: [PATCH 1/2] [AArch64][NFC] Add test for tbl1 intrinsic used for
 broadcasting

---
 llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll | 169 +++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll

diff --git a/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
new file mode 100644
index 0000000000000..fb4c5662cbe8f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+
+define <4 x i32> @tbl_v16i8_broadcast_i32_lane0(<4 x i32> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <4 x i32> %v to <16 x i8>
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+  %res = bitcast <16 x i8> %tbl to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_v16i8_broadcast_i32_lane2(<4 x i32> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <4 x i32> %v to <16 x i8>
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11>)
+  %res = bitcast <16 x i8> %tbl to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @tbl_v16i8_broadcast_i16_lane0(<8 x i16> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.8h, #1, lsl #8
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <8 x i16> %v to <16 x i8>
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+  %res = bitcast <16 x i8> %tbl to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @tbl_v16i8_broadcast_i16_lane5(<8 x i16> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane5:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <8 x i16> %v to <16 x i8>
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11>)
+  %res = bitcast <16 x i8> %tbl to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <2 x i64> @tbl_v16i8_broadcast_i64_lane1(<2 x i64> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i64_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <2 x i64> %v to <16 x i8>
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  %res = bitcast <16 x i8> %tbl to <2 x i64>
+  ret <2 x i64> %res
+}
+
+; Negative tests - should NOT be converted to DUP
+
+define <4 x i32> @tbl_not_broadcast_mixed(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_mixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI5_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <4 x i32> %v to <16 x i8>
+  ; Mixed pattern - alternates between lanes 2 and 0
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3>)
+  %res = bitcast <16 x i8> %tbl to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_not_broadcast_all_negative(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_all_negative:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <4 x i32> %v to <16 x i8>
+  ; All indices are out-of-bounds (0xFF = -1)
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> splat (i8 -1))
+  %res = bitcast <16 x i8> %tbl to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_not_broadcast_some_negative(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_some_negative:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI7_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <4 x i32> %v to <16 x i8>
+  ; Mix of valid broadcast pattern with negatives - should not optimize
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %res = bitcast <16 x i8> %tbl to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_not_broadcast_reverse(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_reverse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI8_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ret
+  %bc = bitcast <4 x i32> %v to <16 x i8>
+  ; Byte-reversed within element - not a simple broadcast
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 11, i8 10, i8 9, i8 8, i8 11, i8 10, i8 9, i8 8, i8 11, i8 10, i8 9, i8 8, i8 11, i8 10, i8 9, i8 8>)
+  %res = bitcast <16 x i8> %tbl to <4 x i32>
+  ret <4 x i32> %res
+}
+
+; Shufflevector-based tests
+
+define <4 x i32> @shuffle_bitcast_broadcast_i32_lane2(<4 x i32> %v) {
+; CHECK-LABEL: shuffle_bitcast_broadcast_i32_lane2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4s, v0.s[2]
+; CHECK-NEXT:    ret
+  %bc1 = bitcast <4 x i32> %v to <16 x i8>
+  ; Broadcast bytes 8-11 (lane 2 of i32) across all positions
+  %shuf = shufflevector <16 x i8> %bc1, <16 x i8> poison,
+    <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11,
+                i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+  %bc2 = bitcast <16 x i8> %shuf to <4 x i32>
+  ret <4 x i32> %bc2
+}
+
+define <8 x i16> @shuffle_bitcast_broadcast_i16_lane3(<8 x i16> %v) {
+; CHECK-LABEL: shuffle_bitcast_broadcast_i16_lane3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, v0.h[3]
+; CHECK-NEXT:    ret
+  %bc1 = bitcast <8 x i16> %v to <16 x i8>
+  ; Broadcast bytes 6-7 (lane 3 of i16) across all positions
+  %shuf = shufflevector <16 x i8> %bc1, <16 x i8> poison,
+    <16 x i32> <i32 6, i32 7, i32 6, i32 7, i32 6, i32 7, i32 6, i32 7,
+                i32 6, i32 7, i32 6, i32 7, i32 6, i32 7, i32 6, i32 7>
+  %bc2 = bitcast <16 x i8> %shuf to <8 x i16>
+  ret <8 x i16> %bc2
+}
+
+define <2 x i64> @shuffle_bitcast_broadcast_i64_lane0(<2 x i64> %v) {
+; CHECK-LABEL: shuffle_bitcast_broadcast_i64_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    ret
+  %bc1 = bitcast <2 x i64> %v to <16 x i8>
+  ; Broadcast bytes 0-7 (lane 0 of i64) across both halves
+  %shuf = shufflevector <16 x i8> %bc1, <16 x i8> poison,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %bc2 = bitcast <16 x i8> %shuf to <2 x i64>
+  ret <2 x i64> %bc2
+}
+
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
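
A note on the matching logic in the second patch below: tryConvertTBLToDUP
only collects the constant byte indices (rejecting any negative entry) and
then defers to isWideDUPMask through the new tryLowerToWideDUP helper. For a
byte-granular v16i8 index vector, that condition amounts to the following
standalone sketch (my paraphrase of the check as it is used here, not the
actual LLVM helper, which also tolerates undef shuffle-mask elements):

#include <optional>
#include <vector>

// Sketch: given the 16 constant byte indices of a tbl1 into a single
// 128-bit table register, return the wide lane they broadcast, if any.
// LaneBits is 64, 32, or 16; the bytes must replay the group
// [Base, Base+1, ..., Base+BytesPerLane-1] with Base lane-aligned.
std::optional<unsigned> matchWideBroadcast(const std::vector<int> &Idx,
                                           unsigned LaneBits) {
  const int BytesPerLane = static_cast<int>(LaneBits / 8);
  if (Idx.size() != 16 || Idx[0] < 0 || Idx[0] % BytesPerLane != 0)
    return std::nullopt;
  for (size_t I = 0; I < Idx.size(); ++I)
    if (Idx[I] != Idx[0] + (static_cast<int>(I) % BytesPerLane))
      return std::nullopt;
  return static_cast<unsigned>(Idx[0] / BytesPerLane);
}

// For example, the indices in tbl_v16i8_broadcast_i32_lane2 give lane 2 at
// LaneBits == 32, while the mixed, reversed, and negative index vectors in
// the negative tests fail for every lane size, so the tbl stays put.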

From bcc6e170514a38454901f5e6ad42563202fc582d Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Fri, 5 Dec 2025 15:28:23 +0000
Subject: [PATCH 2/2] [AArch64] Fold tbl1 intrinsic into dup when broadcasting

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 83 +++++++++++++++----
 llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll  | 19 ++---
 2 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7a15d7b75f1b9..866e7f96609e8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14648,6 +14648,31 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
   return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
 }
 
+/// Try to lower an index vector that broadcasts a single wider lane to a
+/// DUP instruction, trying lane sizes of 64, 32, and 16 bits.
+static SDValue tryLowerToWideDUP(ArrayRef<int> Indices, EVT IndicesVT,
+                                 SDValue Data, EVT ResultVT, const SDLoc &DL,
+                                 SelectionDAG &DAG) {
+  for (unsigned LaneSize : {64U, 32U, 16U}) {
+    unsigned Lane = 0;
+    if (isWideDUPMask(Indices, IndicesVT, LaneSize, Lane)) {
+      unsigned Opcode = LaneSize == 64   ? AArch64ISD::DUPLANE64
+                        : LaneSize == 32 ? AArch64ISD::DUPLANE32
+                                         : AArch64ISD::DUPLANE16;
+      // Cast Data to an integer vector with the required lane size
+      MVT NewEltTy = MVT::getIntegerVT(LaneSize);
+      unsigned NewEltCount = Data.getValueSizeInBits() / LaneSize;
+      MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
+      SDValue CastVec = DAG.getBitcast(NewVecTy, Data);
+      // Construct the DUP instruction
+      SDValue Dup = constructDup(CastVec, Lane, DL, NewVecTy, Opcode, DAG);
+      // Cast back to the original type
+      return DAG.getBitcast(ResultVT, Dup);
+    }
+  }
+  return SDValue();
+}
+
 // Try to widen element type to get a new mask value for a better permutation
 // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
 // UZP1/2, TRN1/2, REV, INS, etc.
@@ -14843,23 +14868,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   }
 
   // Check if the mask matches a DUP for a wider element
-  for (unsigned LaneSize : {64U, 32U, 16U}) {
-    unsigned Lane = 0;
-    if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
-      unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
-                                       : LaneSize == 32 ? AArch64ISD::DUPLANE32
-                                                        : AArch64ISD::DUPLANE16;
-      // Cast V1 to an integer vector with required lane size
-      MVT NewEltTy = MVT::getIntegerVT(LaneSize);
-      unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
-      MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
-      V1 = DAG.getBitcast(NewVecTy, V1);
-      // Construct the DUP instruction
-      V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
-      // Cast back to the original type
-      return DAG.getBitcast(VT, V1);
-    }
-  }
+  if (SDValue WideDup = tryLowerToWideDUP(ShuffleMask, VT, V1, VT, DL, DAG))
+    return WideDup;
 
   unsigned NumElts = VT.getVectorNumElements();
   unsigned EltSize = VT.getScalarSizeInBits();
@@ -23178,6 +23188,41 @@ static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
   }
 }
 
+// Try to convert TBL instructions that broadcast a single element to DUP.
+static SDValue tryConvertTBLToDUP(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+         "Expected intrinsic node");
+
+  // TBL1 has the data in operand 1 and indices in operand 2
+  SDValue Data = N->getOperand(1);
+  SDValue Indices = N->getOperand(2);
+
+  // Only handle constant index vectors
+  const auto *IndicesNode = dyn_cast<BuildVectorSDNode>(Indices);
+  if (!IndicesNode)
+    return SDValue();
+
+  // TODO: handle tbl1.v8i8 as well
+  EVT IndicesVT = Indices.getValueType();
+  if (IndicesVT != MVT::v16i8)
+    return SDValue();
+
+  const unsigned NumBytes = IndicesVT.getVectorNumElements();
+
+  // Extract all byte index values
+  SmallVector<int, 16> ByteIndices;
+  for (unsigned I = 0; I < NumBytes; ++I) {
+    const auto *Idx = dyn_cast<ConstantSDNode>(IndicesNode->getOperand(I));
+    if (!Idx || Idx->getSExtValue() < 0)
+      return SDValue();
+    ByteIndices.push_back(Idx->getSExtValue());
+  }
+
+  SDLoc DL(N);
+  EVT ResultVT = N->getValueType(0);
+  return tryLowerToWideDUP(ByteIndices, IndicesVT, Data, ResultVT, DL, DAG);
+}
+
 static SDValue performIntrinsicCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const AArch64Subtarget *Subtarget) {
@@ -23238,6 +23283,10 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_uabd:
     return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_tbl1:
+    if (SDValue DUP = tryConvertTBLToDUP(N, DAG))
+      return DUP;
+    break;
   case Intrinsic::aarch64_neon_fcvtzs:
     return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
   case Intrinsic::aarch64_neon_fcvtzu:
diff --git a/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
index fb4c5662cbe8f..5ce9614ad429b 100644
--- a/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
+++ b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
@@ -4,9 +4,7 @@
 define <4 x i32> @tbl_v16i8_broadcast_i32_lane0(<4 x i32> %v) {
 ; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
 ; CHECK-NEXT:    ret
   %bc = bitcast <4 x i32> %v to <16 x i8>
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
@@ -17,9 +15,7 @@ define <4 x i32> @tbl_v16i8_broadcast_i32_lane0(<4 x i32> %v) {
 define <4 x i32> @tbl_v16i8_broadcast_i32_lane2(<4 x i32> %v) {
 ; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI1_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    dup v0.4s, v0.s[2]
 ; CHECK-NEXT:    ret
   %bc = bitcast <4 x i32> %v to <16 x i8>
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11>)
@@ -30,8 +26,7 @@ define <4 x i32> @tbl_v16i8_broadcast_i32_lane2(<4 x i32> %v) {
 define <8 x i16> @tbl_v16i8_broadcast_i16_lane0(<8 x i16> %v) {
 ; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1, lsl #8
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
 ; CHECK-NEXT:    ret
   %bc = bitcast <8 x i16> %v to <16 x i8>
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
@@ -42,9 +37,7 @@ define <8 x i16> @tbl_v16i8_broadcast_i16_lane0(<8 x i16> %v) {
 define <8 x i16> @tbl_v16i8_broadcast_i16_lane5(<8 x i16> %v) {
 ; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane5:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    dup v0.8h, v0.h[5]
 ; CHECK-NEXT:    ret
   %bc = bitcast <8 x i16> %v to <16 x i8>
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11>)
@@ -55,9 +48,7 @@ define <8 x i16> @tbl_v16i8_broadcast_i16_lane5(<8 x i16> %v) {
 define <2 x i64> @tbl_v16i8_broadcast_i64_lane1(<2 x i64> %v) {
 ; CHECK-LABEL: tbl_v16i8_broadcast_i64_lane1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI4_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    dup v0.2d, v0.d[1]
 ; CHECK-NEXT:    ret
   %bc = bitcast <2 x i64> %v to <16 x i8>
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)


