[llvm] 267d6d6 - [AArch64] Use NEON's tbl1 for 16xi8 and 8xi8 build vector with mask.
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 29 07:26:36 PDT 2023
Author: Lawrence Benson
Date: 2023-03-29T15:26:28+01:00
New Revision: 267d6d665cf2379ebfcc65fa385a35529c83a7d0
URL: https://github.com/llvm/llvm-project/commit/267d6d665cf2379ebfcc65fa385a35529c83a7d0
DIFF: https://github.com/llvm/llvm-project/commit/267d6d665cf2379ebfcc65fa385a35529c83a7d0.diff
LOG: [AArch64] Use NEON's tbl1 for 16xi8 and 8xi8 build vector with mask.
When using Clang's __builtin_shufflevector with a 16xi8 or 8xi8 source and
runtime mask on an AArch64 target, LLVM currently generates 16 or 8
extract+and+insert operations. This patch replaces these inserts with (a vector
AND +) NEON's tbl1 instruction.
Issue: https://github.com/llvm/llvm-project/issues/60515
Differential Revision: https://reviews.llvm.org/D146212
Added:
llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bd896a88c5a06..6fb2ef07d0252 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10199,6 +10199,98 @@ static unsigned getExtFactor(SDValue &V) {
return EltType.getSizeInBits() / 8;
}
+// Match a BUILD_VECTOR whose element i is extracted from one source vector at
+// a runtime index taken from lane i of one mask vector (optionally ANDed with
+// a constant) and rebuild it as (AND +) TBL1. Returns SDValue() on mismatch.
+SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ assert(!VT.isScalableVector() &&
+ "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
+
+ // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
+ // directly to TBL1.
+ if (VT != MVT::v16i8 && VT != MVT::v8i8)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((NumElts == 8 || NumElts == 16) &&
+ "Need to have exactly 8 or 16 elements in vector.");
+
+ SDValue SourceVec; // Vector every result element is extracted from.
+ SDValue MaskSourceVec; // Vector supplying the runtime extraction indices.
+ SmallVector<SDValue, 16> AndMaskConstants; // Per-lane AND constants, if any.
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ // NOTE(review): unlike MaskSourceVec, SourceVec's type is never compared with VT - confirm.
+ SDValue OperandSourceVec = V.getOperand(0);
+ if (!SourceVec)
+ SourceVec = OperandSourceVec; // First lane fixes the source vector.
+ else if (SourceVec != OperandSourceVec)
+ return SDValue(); // Every lane must read from the same source vector.
+
+ // This only looks at shuffles whose extraction index is either
+ // a) a mask-vector element truncated by a constant AND mask, or
+ // b) a mask-vector element used directly.
+ SDValue MaskSource = V.getOperand(1);
+ if (MaskSource.getOpcode() == ISD::AND) {
+ if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
+ return SDValue();
+ // NOTE(review): an AND lane following AND-less lanes is not rejected here - confirm.
+ AndMaskConstants.push_back(MaskSource.getOperand(1));
+ MaskSource = MaskSource->getOperand(0);
+ } else if (!AndMaskConstants.empty()) {
+ // Either all or no operands should have an AND mask.
+ return SDValue();
+ }
+
+ // An ANY_EXTEND may be inserted between the AND and the mask vector
+ // extraction. We don't care about that, so we can just skip it.
+ if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
+ MaskSource = MaskSource.getOperand(0);
+
+ if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue MaskIdx = MaskSource.getOperand(1); // Lane read from the mask vector.
+ if (!isa<ConstantSDNode>(MaskIdx) ||
+ !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
+ return SDValue(); // Result lane i must take its index from mask lane i.
+
+ // We only apply this if all indices come from the same mask vector and
+ // that mask vector has the same type as the result (VT).
+ if (!MaskSourceVec) {
+ MaskSourceVec = MaskSource->getOperand(0);
+ if (MaskSourceVec.getValueType() != VT)
+ return SDValue();
+ } else if (MaskSourceVec != MaskSource->getOperand(0)) {
+ return SDValue();
+ }
+ }
+
+ // We need a v16i8 for TBL, so we extend the source with a placeholder vector
+ // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
+ // insert, we know that the index in the mask must be smaller than the number
+ // of elements in the source, or we would have an out-of-bounds access.
+ if (NumElts == 8)
+ SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
+ DAG.getUNDEF(VT));
+
+ // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
+ if (!AndMaskConstants.empty())
+ MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
+ DAG.getBuildVector(VT, dl, AndMaskConstants));
+
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
+ MaskSourceVec);
+}
+
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
@@ -12340,8 +12432,11 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
- if (SDValue shuffle = ReconstructShuffle(Op, DAG))
- return shuffle;
+ if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
+ return Shuffle;
+
+ if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
+ return Shuffle;
}
if (PreferDUPAndInsert) {
diff --git a/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll b/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
new file mode 100644
index 0000000000000..64ba7b2b4ee94
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
@@ -0,0 +1,389 @@
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+
+; This is the IR generated by Clang's __builtin_shufflevector for two 16x uint8_t vectors.
+define <16 x i8> @shuffle16_with_and_mask(<16 x i8> %src, <16 x i8> %mask) {
+; CHECK-LABEL: shuffle16_with_and_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.16b v2, #15
+; CHECK-NEXT: and.16b v1, v1, v2
+; CHECK-NEXT: tbl.16b v0, { v0 }, v1
+; CHECK-NEXT: ret
+
+ %masked_mask = and <16 x i8> %mask, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+ %1 = extractelement <16 x i8> %masked_mask, i64 0
+ %2 = extractelement <16 x i8> %src, i8 %1
+ %3 = insertelement <16 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <16 x i8> %masked_mask, i64 1
+ %5 = extractelement <16 x i8> %src, i8 %4
+ %6 = insertelement <16 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <16 x i8> %masked_mask, i64 2
+ %8 = extractelement <16 x i8> %src, i8 %7
+ %9 = insertelement <16 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <16 x i8> %masked_mask, i64 3
+ %11 = extractelement <16 x i8> %src, i8 %10
+ %12 = insertelement <16 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <16 x i8> %masked_mask, i64 4
+ %14 = extractelement <16 x i8> %src, i8 %13
+ %15 = insertelement <16 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <16 x i8> %masked_mask, i64 5
+ %17 = extractelement <16 x i8> %src, i8 %16
+ %18 = insertelement <16 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <16 x i8> %masked_mask, i64 6
+ %20 = extractelement <16 x i8> %src, i8 %19
+ %21 = insertelement <16 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <16 x i8> %masked_mask, i64 7
+ %23 = extractelement <16 x i8> %src, i8 %22
+ %24 = insertelement <16 x i8> %21, i8 %23, i64 7
+ %25 = extractelement <16 x i8> %masked_mask, i64 8
+ %26 = extractelement <16 x i8> %src, i8 %25
+ %27 = insertelement <16 x i8> %24, i8 %26, i64 8
+ %28 = extractelement <16 x i8> %masked_mask, i64 9
+ %29 = extractelement <16 x i8> %src, i8 %28
+ %30 = insertelement <16 x i8> %27, i8 %29, i64 9
+ %31 = extractelement <16 x i8> %masked_mask, i64 10
+ %32 = extractelement <16 x i8> %src, i8 %31
+ %33 = insertelement <16 x i8> %30, i8 %32, i64 10
+ %34 = extractelement <16 x i8> %masked_mask, i64 11
+ %35 = extractelement <16 x i8> %src, i8 %34
+ %36 = insertelement <16 x i8> %33, i8 %35, i64 11
+ %37 = extractelement <16 x i8> %masked_mask, i64 12
+ %38 = extractelement <16 x i8> %src, i8 %37
+ %39 = insertelement <16 x i8> %36, i8 %38, i64 12
+ %40 = extractelement <16 x i8> %masked_mask, i64 13
+ %41 = extractelement <16 x i8> %src, i8 %40
+ %42 = insertelement <16 x i8> %39, i8 %41, i64 13
+ %43 = extractelement <16 x i8> %masked_mask, i64 14
+ %44 = extractelement <16 x i8> %src, i8 %43
+ %45 = insertelement <16 x i8> %42, i8 %44, i64 14
+ %46 = extractelement <16 x i8> %masked_mask, i64 15
+ ; Make sure that ANY_EXTEND is ignored
+ %47 = zext i8 %46 to i32
+ %48 = extractelement <16 x i8> %src, i32 %47
+ %49 = insertelement <16 x i8> %45, i8 %48, i64 15
+ ret <16 x i8> %49
+}
+
+define <8 x i8> @shuffle8_with_and_mask(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: shuffle8_with_and_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.8b v2, #7
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: and.8b v1, v1, v2
+; CHECK-NEXT: tbl.8b v0, { v0 }, v1
+; CHECK-NEXT: ret
+
+ %masked_mask = and <8 x i8> %mask, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %1 = extractelement <8 x i8> %masked_mask, i64 0
+ %2 = extractelement <8 x i8> %src, i8 %1
+ %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <8 x i8> %masked_mask, i64 1
+ %5 = extractelement <8 x i8> %src, i8 %4
+ %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <8 x i8> %masked_mask, i64 2
+ %8 = extractelement <8 x i8> %src, i8 %7
+ %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <8 x i8> %masked_mask, i64 3
+ %11 = extractelement <8 x i8> %src, i8 %10
+ %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <8 x i8> %masked_mask, i64 4
+ %14 = extractelement <8 x i8> %src, i8 %13
+ %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <8 x i8> %masked_mask, i64 5
+ %17 = extractelement <8 x i8> %src, i8 %16
+ %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <8 x i8> %masked_mask, i64 6
+ %20 = extractelement <8 x i8> %src, i8 %19
+ %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <8 x i8> %masked_mask, i64 7
+ %23 = extractelement <8 x i8> %src, i8 %22
+ %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+ ret <8 x i8> %24
+}
+
+define <8 x i8> @shuffle8_with_and_mask_different_constants(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: LCPI2_0:
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 7
+
+; CHECK-LABEL: shuffle8_with_and_mask_different_constants:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: and.8b v1, v1, v2
+; CHECK-NEXT: tbl.8b v0, { v0 }, v1
+; CHECK-NEXT: ret
+
+ %masked_mask = and <8 x i8> %mask, <i8 3, i8 1, i8 7, i8 1, i8 7, i8 3, i8 7, i8 7>
+ %1 = extractelement <8 x i8> %masked_mask, i64 0
+ %2 = extractelement <8 x i8> %src, i8 %1
+ %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <8 x i8> %masked_mask, i64 1
+ %5 = extractelement <8 x i8> %src, i8 %4
+ %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <8 x i8> %masked_mask, i64 2
+ %8 = extractelement <8 x i8> %src, i8 %7
+ %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <8 x i8> %masked_mask, i64 3
+ %11 = extractelement <8 x i8> %src, i8 %10
+ %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <8 x i8> %masked_mask, i64 4
+ %14 = extractelement <8 x i8> %src, i8 %13
+ %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <8 x i8> %masked_mask, i64 5
+ %17 = extractelement <8 x i8> %src, i8 %16
+ %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <8 x i8> %masked_mask, i64 6
+ %20 = extractelement <8 x i8> %src, i8 %19
+ %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <8 x i8> %masked_mask, i64 7
+ %23 = extractelement <8 x i8> %src, i8 %22
+ %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+ ret <8 x i8> %24
+}
+
+define <8 x i8> @shuffle8_with_mask(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: shuffle8_with_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: tbl.8b v0, { v0 }, v1
+; CHECK-NEXT: ret
+
+ %1 = extractelement <8 x i8> %mask, i64 0
+ %2 = extractelement <8 x i8> %src, i8 %1
+ %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <8 x i8> %mask, i64 1
+ %5 = extractelement <8 x i8> %src, i8 %4
+ %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <8 x i8> %mask, i64 2
+ %8 = extractelement <8 x i8> %src, i8 %7
+ %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <8 x i8> %mask, i64 3
+ %11 = extractelement <8 x i8> %src, i8 %10
+ %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <8 x i8> %mask, i64 4
+ %14 = extractelement <8 x i8> %src, i8 %13
+ %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <8 x i8> %mask, i64 5
+ %17 = extractelement <8 x i8> %src, i8 %16
+ %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <8 x i8> %mask, i64 6
+ %20 = extractelement <8 x i8> %src, i8 %19
+ %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <8 x i8> %mask, i64 7
+ %23 = extractelement <8 x i8> %src, i8 %22
+ %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+ ret <8 x i8> %24
+}
+
+define <8 x i8> @no_shuffle_only_some_and_constants(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: no_shuffle_only_some_and_constants:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+ ; Element at 0 has a AND mask, element at 1 does not.
+ %1 = extractelement <8 x i8> %mask, i64 0
+ %masked_elt1 = and i8 %1, 7
+ %2 = extractelement <8 x i8> %src, i8 %masked_elt1
+ %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <8 x i8> %mask, i64 1
+ %5 = extractelement <8 x i8> %src, i8 %4
+ %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+
+ %7 = extractelement <8 x i8> %mask, i64 2
+ %8 = extractelement <8 x i8> %src, i8 %7
+ %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <8 x i8> %mask, i64 3
+ %11 = extractelement <8 x i8> %src, i8 %10
+ %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <8 x i8> %mask, i64 4
+ %14 = extractelement <8 x i8> %src, i8 %13
+ %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <8 x i8> %mask, i64 5
+ %17 = extractelement <8 x i8> %src, i8 %16
+ %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <8 x i8> %mask, i64 6
+ %20 = extractelement <8 x i8> %src, i8 %19
+ %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <8 x i8> %mask, i64 7
+ %23 = extractelement <8 x i8> %src, i8 %22
+ %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+ ret <8 x i8> %24
+}
+
+; Takes alternating entries from two mask source vectors. Currently not supported.
+define <16 x i8> @no_shuffle_with_two_mask_sources(<16 x i8> %src, <16 x i8> %mask1, <16 x i8> %mask2) {
+; CHECK-LABEL: shuffle_with_two_mask_sources:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+ %masked_mask1 = and <16 x i8> %mask1, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+ %masked_mask2 = and <16 x i8> %mask2, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+ %1 = extractelement <16 x i8> %masked_mask1, i64 0
+ %2 = extractelement <16 x i8> %src, i8 %1
+ %3 = insertelement <16 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <16 x i8> %masked_mask2, i64 1
+ %5 = extractelement <16 x i8> %src, i8 %4
+ %6 = insertelement <16 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <16 x i8> %masked_mask1, i64 2
+ %8 = extractelement <16 x i8> %src, i8 %7
+ %9 = insertelement <16 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <16 x i8> %masked_mask2, i64 3
+ %11 = extractelement <16 x i8> %src, i8 %10
+ %12 = insertelement <16 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <16 x i8> %masked_mask1, i64 4
+ %14 = extractelement <16 x i8> %src, i8 %13
+ %15 = insertelement <16 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <16 x i8> %masked_mask2, i64 5
+ %17 = extractelement <16 x i8> %src, i8 %16
+ %18 = insertelement <16 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <16 x i8> %masked_mask1, i64 6
+ %20 = extractelement <16 x i8> %src, i8 %19
+ %21 = insertelement <16 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <16 x i8> %masked_mask2, i64 7
+ %23 = extractelement <16 x i8> %src, i8 %22
+ %24 = insertelement <16 x i8> %21, i8 %23, i64 7
+ %25 = extractelement <16 x i8> %masked_mask1, i64 8
+ %26 = extractelement <16 x i8> %src, i8 %25
+ %27 = insertelement <16 x i8> %24, i8 %26, i64 8
+ %28 = extractelement <16 x i8> %masked_mask2, i64 9
+ %29 = extractelement <16 x i8> %src, i8 %28
+ %30 = insertelement <16 x i8> %27, i8 %29, i64 9
+ %31 = extractelement <16 x i8> %masked_mask1, i64 10
+ %32 = extractelement <16 x i8> %src, i8 %31
+ %33 = insertelement <16 x i8> %30, i8 %32, i64 10
+ %34 = extractelement <16 x i8> %masked_mask2, i64 11
+ %35 = extractelement <16 x i8> %src, i8 %34
+ %36 = insertelement <16 x i8> %33, i8 %35, i64 11
+ %37 = extractelement <16 x i8> %masked_mask1, i64 12
+ %38 = extractelement <16 x i8> %src, i8 %37
+ %39 = insertelement <16 x i8> %36, i8 %38, i64 12
+ %40 = extractelement <16 x i8> %masked_mask2, i64 13
+ %41 = extractelement <16 x i8> %src, i8 %40
+ %42 = insertelement <16 x i8> %39, i8 %41, i64 13
+ %43 = extractelement <16 x i8> %masked_mask1, i64 14
+ %44 = extractelement <16 x i8> %src, i8 %43
+ %45 = insertelement <16 x i8> %42, i8 %44, i64 14
+ %46 = extractelement <16 x i8> %masked_mask2, i64 15
+ %47 = extractelement <16 x i8> %src, i8 %46
+ %48 = insertelement <16 x i8> %45, i8 %47, i64 15
+ ret <16 x i8> %48
+}
+
+; Non-supported vector type.
+define <4 x i32> @no_for_shuffle_int_vector(<4 x i32> %src, <4 x i32> %mask) {
+; CHECK-LABEL: no_for_shuffle_int_vector:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+ %masked_mask = and <4 x i32> %mask, <i32 3, i32 3, i32 3, i32 3>
+ %1 = extractelement <4 x i32> %masked_mask, i64 0
+ %2 = extractelement <4 x i32> %src, i32 %1
+ %3 = insertelement <4 x i32> undef, i32 %2, i64 0
+ %4 = extractelement <4 x i32> %masked_mask, i64 1
+ %5 = extractelement <4 x i32> %src, i32 %4
+ %6 = insertelement <4 x i32> %3, i32 %5, i64 1
+ %7 = extractelement <4 x i32> %masked_mask, i64 2
+ %8 = extractelement <4 x i32> %src, i32 %7
+ %9 = insertelement <4 x i32> %6, i32 %8, i64 2
+ %10 = extractelement <4 x i32> %masked_mask, i64 3
+ %11 = extractelement <4 x i32> %src, i32 %10
+ %12 = insertelement <4 x i32> %9, i32 %11, i64 3
+ ret <4 x i32> %12
+}
+
+define <8 x i8> @no_shuffle_not_enough_elements(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: no_shuffle_not_enough_elements:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+ %masked_mask = and <8 x i8> %mask, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %1 = extractelement <8 x i8> %masked_mask, i64 0
+ %2 = extractelement <8 x i8> %src, i8 %1
+ %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <8 x i8> %masked_mask, i64 1
+ %5 = extractelement <8 x i8> %src, i8 %4
+ %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <8 x i8> %masked_mask, i64 2
+ %8 = extractelement <8 x i8> %src, i8 %7
+ %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <8 x i8> %masked_mask, i64 3
+ %11 = extractelement <8 x i8> %src, i8 %10
+ %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+ ret <8 x i8> %12
+}
+
+define <8 x i8> @no_shuffle_different_vector_types(<8 x i8> %src, <16 x i8> %mask) {
+; CHECK-LABEL: no_shuffle_different_vector_types:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+ %masked_mask = and <16 x i8> %mask, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %1 = extractelement <16 x i8> %masked_mask, i64 0
+ %2 = extractelement <8 x i8> %src, i8 %1
+ %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <16 x i8> %masked_mask, i64 1
+ %5 = extractelement <8 x i8> %src, i8 %4
+ %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <16 x i8> %masked_mask, i64 2
+ %8 = extractelement <8 x i8> %src, i8 %7
+ %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <16 x i8> %masked_mask, i64 3
+ %11 = extractelement <8 x i8> %src, i8 %10
+ %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <16 x i8> %masked_mask, i64 4
+ %14 = extractelement <8 x i8> %src, i8 %13
+ %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <16 x i8> %masked_mask, i64 5
+ %17 = extractelement <8 x i8> %src, i8 %16
+ %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <16 x i8> %masked_mask, i64 6
+ %20 = extractelement <8 x i8> %src, i8 %19
+ %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <16 x i8> %masked_mask, i64 7
+ %23 = extractelement <8 x i8> %src, i8 %22
+ %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+ ret <8 x i8> %24
+}
+
+define <8 x i8> @no_shuffle_bad_mask_index(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: no_shuffle_bad_mask_index:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+ %masked_mask = and <8 x i8> %mask, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+
+ ; This should extract at 0, but because it extracts at 1, the pattern does not match.
+ %1 = extractelement <8 x i8> %masked_mask, i64 1
+
+ %2 = extractelement <8 x i8> %src, i8 %1
+ %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+ %4 = extractelement <8 x i8> %masked_mask, i64 1
+ %5 = extractelement <8 x i8> %src, i8 %4
+ %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+ %7 = extractelement <8 x i8> %masked_mask, i64 2
+ %8 = extractelement <8 x i8> %src, i8 %7
+ %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+ %10 = extractelement <8 x i8> %masked_mask, i64 3
+ %11 = extractelement <8 x i8> %src, i8 %10
+ %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+ %13 = extractelement <8 x i8> %masked_mask, i64 4
+ %14 = extractelement <8 x i8> %src, i8 %13
+ %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+ %16 = extractelement <8 x i8> %masked_mask, i64 5
+ %17 = extractelement <8 x i8> %src, i8 %16
+ %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+ %19 = extractelement <8 x i8> %masked_mask, i64 6
+ %20 = extractelement <8 x i8> %src, i8 %19
+ %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+ %22 = extractelement <8 x i8> %masked_mask, i64 7
+ %23 = extractelement <8 x i8> %src, i8 %22
+ %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+ ret <8 x i8> %24
+}
More information about the llvm-commits
mailing list