[llvm] 38275ab - [X86] Move lowerShuffleAsPermuteAndUnpack earlier in the source next to similar helpers. NFC.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 25 07:00:34 PST 2022
Author: Simon Pilgrim
Date: 2022-11-25T14:56:38Z
New Revision: 38275ab1b39868e4e4f18631891a0ea25b8a87bb
URL: https://github.com/llvm/llvm-project/commit/38275ab1b39868e4e4f18631891a0ea25b8a87bb
DIFF: https://github.com/llvm/llvm-project/commit/38275ab1b39868e4e4f18631891a0ea25b8a87bb.diff
LOG: [X86] Move lowerShuffleAsPermuteAndUnpack earlier in the source next to similar helpers. NFC.
I'm currently investigating using this inside lowerShuffleAsDecomposedShuffleMerge
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f293058abec0..1ce6207c8979 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13167,6 +13167,126 @@ static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
+/// Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
+static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
+ assert(VT.is128BitVector() && "This routine only works on 128-bit vectors.");
+ assert(!V2.isUndef() &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ int NumLoInputs =
+ count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
+ int NumHiInputs =
+ count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
+
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ auto TryUnpack = [&](int ScalarSize, int Scale) {
+ SmallVector<int, 16> V1Mask((unsigned)Size, -1);
+ SmallVector<int, 16> V2Mask((unsigned)Size, -1);
+
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
+
+ // If we will have to shuffle both inputs to use the unpack, check whether
+ // we can just unpack first and shuffle the result. If so, skip this unpack.
+ if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+ !isNoopShuffleMask(V2Mask))
+ return SDValue();
+
+ // Shuffle the inputs into place.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ MVT UnpackVT =
+ MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
+ V1 = DAG.getBitcast(UnpackVT, V1);
+ V2 = DAG.getBitcast(UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getBitcast(
+ VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
+ if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
+ return Unpack;
+
+ // If we're shuffling with a zero vector then we're better off not doing
+ // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
+ if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
+ ISD::isBuildVectorAllZeros(V2.getNode()))
+ return SDValue();
+
+ // If none of the unpack-rooted lowerings worked (or were profitable) try an
+ // initial unpack.
+ if (NumLoInputs == 0 || NumHiInputs == 0) {
+ assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+ "We have to have *some* inputs!");
+ int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+ // FIXME: We could consider the total complexity of the permute of each
+ // possible unpacking. Or at the least we should consider how many
+ // half-crossings are created.
+ // FIXME: We could consider commuting the unpacks.
+
+ SmallVector<int, 32> PermMask((unsigned)Size, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+ PermMask[i] =
+ 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+ }
+ return DAG.getVectorShuffle(
+ VT, DL,
+ DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
+ V1, V2),
+ DAG.getUNDEF(VT), PermMask);
+ }
+
+ return SDValue();
+}
+
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
static SDValue lowerShuffleAsByteRotateAndPermute(
@@ -14846,123 +14966,6 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
-/// Try to lower a shuffle as a permute of the inputs followed by an
-/// UNPCK instruction.
-///
-/// This specifically targets cases where we end up with alternating between
-/// the two inputs, and so can permute them into something that feeds a single
-/// UNPCK instruction. Note that this routine only targets integer vectors
-/// because for floating point vectors we have a generalized SHUFPS lowering
-/// strategy that handles everything that doesn't *exactly* match an unpack,
-/// making this clever lowering unnecessary.
-static SDValue lowerShuffleAsPermuteAndUnpack(
- const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
- assert(!VT.isFloatingPoint() &&
- "This routine only supports integer vectors.");
- assert(VT.is128BitVector() &&
- "This routine only works on 128-bit vectors.");
- assert(!V2.isUndef() &&
- "This routine should only be used when blending two inputs.");
- assert(Mask.size() >= 2 && "Single element masks are invalid.");
-
- int Size = Mask.size();
-
- int NumLoInputs =
- count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
- int NumHiInputs =
- count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
-
- bool UnpackLo = NumLoInputs >= NumHiInputs;
-
- auto TryUnpack = [&](int ScalarSize, int Scale) {
- SmallVector<int, 16> V1Mask((unsigned)Size, -1);
- SmallVector<int, 16> V2Mask((unsigned)Size, -1);
-
- for (int i = 0; i < Size; ++i) {
- if (Mask[i] < 0)
- continue;
-
- // Each element of the unpack contains Scale elements from this mask.
- int UnpackIdx = i / Scale;
-
- // We only handle the case where V1 feeds the first slots of the unpack.
- // We rely on canonicalization to ensure this is the case.
- if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
- return SDValue();
-
- // Setup the mask for this input. The indexing is tricky as we have to
- // handle the unpack stride.
- SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
- VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
- Mask[i] % Size;
- }
-
- // If we will have to shuffle both inputs to use the unpack, check whether
- // we can just unpack first and shuffle the result. If so, skip this unpack.
- if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
- !isNoopShuffleMask(V2Mask))
- return SDValue();
-
- // Shuffle the inputs into place.
- V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
- V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
-
- // Cast the inputs to the type we will use to unpack them.
- MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
- V1 = DAG.getBitcast(UnpackVT, V1);
- V2 = DAG.getBitcast(UnpackVT, V2);
-
- // Unpack the inputs and cast the result back to the desired type.
- return DAG.getBitcast(
- VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
- UnpackVT, V1, V2));
- };
-
- // We try each unpack from the largest to the smallest to try and find one
- // that fits this mask.
- int OrigScalarSize = VT.getScalarSizeInBits();
- for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
- if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
- return Unpack;
-
- // If we're shuffling with a zero vector then we're better off not doing
- // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
- if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
- ISD::isBuildVectorAllZeros(V2.getNode()))
- return SDValue();
-
- // If none of the unpack-rooted lowerings worked (or were profitable) try an
- // initial unpack.
- if (NumLoInputs == 0 || NumHiInputs == 0) {
- assert((NumLoInputs > 0 || NumHiInputs > 0) &&
- "We have to have *some* inputs!");
- int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
-
- // FIXME: We could consider the total complexity of the permute of each
- // possible unpacking. Or at the least we should consider how many
- // half-crossings are created.
- // FIXME: We could consider commuting the unpacks.
-
- SmallVector<int, 32> PermMask((unsigned)Size, -1);
- for (int i = 0; i < Size; ++i) {
- if (Mask[i] < 0)
- continue;
-
- assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
-
- PermMask[i] =
- 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
- }
- return DAG.getVectorShuffle(
- VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
- DL, VT, V1, V2),
- DAG.getUNDEF(VT), PermMask);
- }
-
- return SDValue();
-}
-
/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
More information about the llvm-commits
mailing list