[llvm] [AArch64] Fold tbl1 intrinsic into dup when broadcasting (PR #170872)
Valeriy Savchenko via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 5 07:54:44 PST 2025
https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/170872
From 2a4c0d009b2ef4fa9103d4161fdb7f62ea6ade14 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Fri, 5 Dec 2025 14:31:38 +0000
Subject: [PATCH 1/2] [AArch64][NFC] Add test for tbl1 intrinsic used for
broadcasting
---
llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll | 169 +++++++++++++++++++
1 file changed, 169 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
diff --git a/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
new file mode 100644
index 0000000000000..fb4c5662cbe8f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+
+define <4 x i32> @tbl_v16i8_broadcast_i32_lane0(<4 x i32> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <4 x i32> %v to <16 x i8>
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+ %res = bitcast <16 x i8> %tbl to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_v16i8_broadcast_i32_lane2(<4 x i32> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <4 x i32> %v to <16 x i8>
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11>)
+ %res = bitcast <16 x i8> %tbl to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @tbl_v16i8_broadcast_i16_lane0(<8 x i16> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.8h, #1, lsl #8
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <8 x i16> %v to <16 x i8>
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ %res = bitcast <16 x i8> %tbl to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @tbl_v16i8_broadcast_i16_lane5(<8 x i16> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <8 x i16> %v to <16 x i8>
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11>)
+ %res = bitcast <16 x i8> %tbl to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <2 x i64> @tbl_v16i8_broadcast_i64_lane1(<2 x i64> %v) {
+; CHECK-LABEL: tbl_v16i8_broadcast_i64_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <2 x i64> %v to <16 x i8>
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ %res = bitcast <16 x i8> %tbl to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; Negative tests - should NOT be converted to DUP
+
+define <4 x i32> @tbl_not_broadcast_mixed(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_mixed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI5_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <4 x i32> %v to <16 x i8>
+ ; Mixed pattern - alternates between lanes 2 and 0
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3>)
+ %res = bitcast <16 x i8> %tbl to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_not_broadcast_all_negative(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_all_negative:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <4 x i32> %v to <16 x i8>
+ ; All indices are out-of-bounds (0xFF = -1)
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> splat (i8 -1))
+ %res = bitcast <16 x i8> %tbl to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_not_broadcast_some_negative(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_some_negative:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI7_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <4 x i32> %v to <16 x i8>
+ ; Mix of valid broadcast pattern with negatives - should not optimize
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %res = bitcast <16 x i8> %tbl to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @tbl_not_broadcast_reverse(<4 x i32> %v) {
+; CHECK-LABEL: tbl_not_broadcast_reverse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI8_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: ret
+ %bc = bitcast <4 x i32> %v to <16 x i8>
+ ; Byte-reversed within element - not a simple broadcast
+ %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 11, i8 10, i8 9, i8 8, i8 11, i8 10, i8 9, i8 8, i8 11, i8 10, i8 9, i8 8, i8 11, i8 10, i8 9, i8 8>)
+ %res = bitcast <16 x i8> %tbl to <4 x i32>
+ ret <4 x i32> %res
+}
+
+; Shufflevector-based tests
+
+define <4 x i32> @shuffle_bitcast_broadcast_i32_lane2(<4 x i32> %v) {
+; CHECK-LABEL: shuffle_bitcast_broadcast_i32_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.4s, v0.s[2]
+; CHECK-NEXT: ret
+ %bc1 = bitcast <4 x i32> %v to <16 x i8>
+ ; Broadcast bytes 8-11 (lane 2 of i32) across all positions
+ %shuf = shufflevector <16 x i8> %bc1, <16 x i8> poison,
+ <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11,
+ i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+ %bc2 = bitcast <16 x i8> %shuf to <4 x i32>
+ ret <4 x i32> %bc2
+}
+
+define <8 x i16> @shuffle_bitcast_broadcast_i16_lane3(<8 x i16> %v) {
+; CHECK-LABEL: shuffle_bitcast_broadcast_i16_lane3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[3]
+; CHECK-NEXT: ret
+ %bc1 = bitcast <8 x i16> %v to <16 x i8>
+ ; Broadcast bytes 6-7 (lane 3 of i16) across all positions
+ %shuf = shufflevector <16 x i8> %bc1, <16 x i8> poison,
+ <16 x i32> <i32 6, i32 7, i32 6, i32 7, i32 6, i32 7, i32 6, i32 7,
+ i32 6, i32 7, i32 6, i32 7, i32 6, i32 7, i32 6, i32 7>
+ %bc2 = bitcast <16 x i8> %shuf to <8 x i16>
+ ret <8 x i16> %bc2
+}
+
+define <2 x i64> @shuffle_bitcast_broadcast_i64_lane0(<2 x i64> %v) {
+; CHECK-LABEL: shuffle_bitcast_broadcast_i64_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %bc1 = bitcast <2 x i64> %v to <16 x i8>
+ ; Broadcast bytes 0-7 (lane 0 of i64) across both halves
+ %shuf = shufflevector <16 x i8> %bc1, <16 x i8> poison,
+ <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc2 = bitcast <16 x i8> %shuf to <2 x i64>
+ ret <2 x i64> %bc2
+}
+
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
From bcc6e170514a38454901f5e6ad42563202fc582d Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Fri, 5 Dec 2025 15:28:23 +0000
Subject: [PATCH 2/2] [AArch64] Fold tbl1 intrinsic into dup when broadcasting
---
.../Target/AArch64/AArch64ISelLowering.cpp | 83 +++++++++++++++----
llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll | 19 ++---
2 files changed, 71 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7a15d7b75f1b9..866e7f96609e8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14648,6 +14648,31 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
}
+/// Try to lower an index pattern that broadcasts a single wide lane to a DUP
+/// instruction by trying different lane sizes (64, 32, 16 bits).
+static SDValue tryLowerToWideDUP(ArrayRef<int> Indices, EVT IndicesVT,
+ SDValue Data, EVT ResultVT, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ for (unsigned LaneSize : {64U, 32U, 16U}) {
+ unsigned Lane = 0;
+ if (isWideDUPMask(Indices, IndicesVT, LaneSize, Lane)) {
+ unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
+ : LaneSize == 32 ? AArch64ISD::DUPLANE32
+ : AArch64ISD::DUPLANE16;
+ // Cast Data to an integer vector with the required lane size
+ MVT NewEltTy = MVT::getIntegerVT(LaneSize);
+ unsigned NewEltCount = Data.getValueSizeInBits() / LaneSize;
+ MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
+ SDValue CastVec = DAG.getBitcast(NewVecTy, Data);
+ // Construct the DUP instruction
+ SDValue Dup = constructDup(CastVec, Lane, DL, NewVecTy, Opcode, DAG);
+ // Cast back to the original type
+ return DAG.getBitcast(ResultVT, Dup);
+ }
+ }
+ return SDValue();
+}
+
// Try to widen element type to get a new mask value for a better permutation
// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
// UZP1/2, TRN1/2, REV, INS, etc.
@@ -14843,23 +14868,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
}
// Check if the mask matches a DUP for a wider element
- for (unsigned LaneSize : {64U, 32U, 16U}) {
- unsigned Lane = 0;
- if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
- unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
- : LaneSize == 32 ? AArch64ISD::DUPLANE32
- : AArch64ISD::DUPLANE16;
- // Cast V1 to an integer vector with required lane size
- MVT NewEltTy = MVT::getIntegerVT(LaneSize);
- unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
- MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
- V1 = DAG.getBitcast(NewVecTy, V1);
- // Construct the DUP instruction
- V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
- // Cast back to the original type
- return DAG.getBitcast(VT, V1);
- }
- }
+ if (SDValue WideDup = tryLowerToWideDUP(ShuffleMask, VT, V1, VT, DL, DAG))
+ return WideDup;
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSize = VT.getScalarSizeInBits();
@@ -23178,6 +23188,41 @@ static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
}
}
+// Try to convert TBL instructions that broadcast a single element to DUP.
+static SDValue tryConvertTBLToDUP(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ "Expected intrinsic node");
+
+ // TBL1 has the data in operand 1 and indices in operand 2
+ SDValue Data = N->getOperand(1);
+ SDValue Indices = N->getOperand(2);
+
+ // Only handle constant index vectors
+ const auto *IndicesNode = dyn_cast<BuildVectorSDNode>(Indices);
+ if (!IndicesNode)
+ return SDValue();
+
+ // TODO: handle tbl1.v8i8 as well
+ EVT IndicesVT = Indices.getValueType();
+ if (IndicesVT != MVT::v16i8)
+ return SDValue();
+
+ const unsigned NumBytes = IndicesVT.getVectorNumElements();
+
+ // Extract all byte index values
+ SmallVector<int, 16> ByteIndices;
+ for (unsigned I = 0; I < NumBytes; ++I) {
+ const auto *Idx = dyn_cast<ConstantSDNode>(IndicesNode->getOperand(I));
+ if (!Idx || Idx->getSExtValue() < 0)
+ return SDValue();
+ ByteIndices.push_back(Idx->getSExtValue());
+ }
+
+ SDLoc DL(N);
+ EVT ResultVT = N->getValueType(0);
+ return tryLowerToWideDUP(ByteIndices, IndicesVT, Data, ResultVT, DL, DAG);
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -23238,6 +23283,10 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_uabd:
return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_tbl1:
+ if (SDValue DUP = tryConvertTBLToDUP(N, DAG))
+ return DUP;
+ break;
case Intrinsic::aarch64_neon_fcvtzs:
return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
case Intrinsic::aarch64_neon_fcvtzu:
diff --git a/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
index fb4c5662cbe8f..5ce9614ad429b 100644
--- a/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
+++ b/llvm/test/CodeGen/AArch64/neon-tbl-to-dup.ll
@@ -4,9 +4,7 @@
define <4 x i32> @tbl_v16i8_broadcast_i32_lane0(<4 x i32> %v) {
; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane0:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI0_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: ret
%bc = bitcast <4 x i32> %v to <16 x i8>
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
@@ -17,9 +15,7 @@ define <4 x i32> @tbl_v16i8_broadcast_i32_lane0(<4 x i32> %v) {
define <4 x i32> @tbl_v16i8_broadcast_i32_lane2(<4 x i32> %v) {
; CHECK-LABEL: tbl_v16i8_broadcast_i32_lane2:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI1_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: dup v0.4s, v0.s[2]
; CHECK-NEXT: ret
%bc = bitcast <4 x i32> %v to <16 x i8>
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11, i8 8, i8 9, i8 10, i8 11>)
@@ -30,8 +26,7 @@ define <4 x i32> @tbl_v16i8_broadcast_i32_lane2(<4 x i32> %v) {
define <8 x i16> @tbl_v16i8_broadcast_i16_lane0(<8 x i16> %v) {
; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane0:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8h, #1, lsl #8
-; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: dup v0.8h, v0.h[0]
; CHECK-NEXT: ret
%bc = bitcast <8 x i16> %v to <16 x i8>
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
@@ -42,9 +37,7 @@ define <8 x i16> @tbl_v16i8_broadcast_i16_lane0(<8 x i16> %v) {
define <8 x i16> @tbl_v16i8_broadcast_i16_lane5(<8 x i16> %v) {
; CHECK-LABEL: tbl_v16i8_broadcast_i16_lane5:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: dup v0.8h, v0.h[5]
; CHECK-NEXT: ret
%bc = bitcast <8 x i16> %v to <16 x i8>
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11, i8 10, i8 11>)
@@ -55,9 +48,7 @@ define <8 x i16> @tbl_v16i8_broadcast_i16_lane5(<8 x i16> %v) {
define <2 x i64> @tbl_v16i8_broadcast_i64_lane1(<2 x i64> %v) {
; CHECK-LABEL: tbl_v16i8_broadcast_i64_lane1:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI4_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: dup v0.2d, v0.d[1]
; CHECK-NEXT: ret
%bc = bitcast <2 x i64> %v to <16 x i8>
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %bc, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)