[llvm] 39a0ded - [X86] lowerShuffleAsDecomposedShuffleMerge - don't lower to unpack+permute if either source is zero.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 16 02:42:22 PDT 2024
Author: Simon Pilgrim
Date: 2024-08-16T10:42:02+01:00
New Revision: 39a0dedcbf69adaf9d4ffeebd9172844b38974c7
URL: https://github.com/llvm/llvm-project/commit/39a0dedcbf69adaf9d4ffeebd9172844b38974c7
DIFF: https://github.com/llvm/llvm-project/commit/39a0dedcbf69adaf9d4ffeebd9172844b38974c7.diff
LOG: [X86] lowerShuffleAsDecomposedShuffleMerge - don't lower to unpack+permute if either source is zero.
Fixes #104482
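For readers skimming the diff below: the patch threads the existing Zeroable element mask into lowerShuffleAsDecomposedShuffleMerge and bails out of the unpack+permute lowering when every element demanded from one of the two sources is already known to be zero, since a known-zero source is better served by the blend/zeroing paths than by an extra permute of a zero vector. The following is a small standalone sketch of that guard, not LLVM code: the function name shouldTryUnpackPermute, the std::vector types, and the main() driver are invented for illustration, while the V1Zero/V2Zero bookkeeping and the !V1Zero && !V2Zero condition mirror the hunks below.

// Standalone illustration of the new guard (assumed names, not LLVM code).
// Mask uses the usual shuffle convention: 0..NumElts-1 selects from V1,
// NumElts..2*NumElts-1 selects from V2, -1 is undef. Zeroable[i] is true
// when result element i is known to be zero.
#include <cstdio>
#include <vector>

static bool shouldTryUnpackPermute(const std::vector<int> &Mask,
                                   const std::vector<bool> &Zeroable) {
  int NumElts = (int)Mask.size();
  bool V1Zero = true, V2Zero = true;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts)
      V1Zero &= Zeroable[i]; // result element i comes from V1
    else if (M >= NumElts)
      V2Zero &= Zeroable[i]; // result element i comes from V2
  }
  // New bail-out: if every element taken from one source is known zero,
  // unpack+permute only adds an extra permute of a zero vector.
  return !V1Zero && !V2Zero;
}

int main() {
  // Interleave V2[0..3] with V1[0..3], where V2 happens to be all zeros,
  // so all V2-sourced result elements are zeroable.
  std::vector<int> Mask = {8, 0, 9, 1, 10, 2, 11, 3};
  std::vector<bool> Zeroable = {true, false, true, false,
                                true, false, true, false};
  std::printf("try unpack+permute: %s\n",
              shouldTryUnpackPermute(Mask, Zeroable) ? "yes" : "no");
  return 0;
}

With the mask above the sketch prints "no", which corresponds to the real lowering now falling through to the decomposed shuffle/blend path instead, as exercised by the PR104482 test update at the end of the diff.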
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 11c9a992cbdee..db50f132b1349 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11156,7 +11156,7 @@ static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
static SDValue lowerShuffleAsDecomposedShuffleMerge(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
@@ -11164,6 +11164,7 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
// Shuffle the input elements into the desired positions in V1 and V2 and
// unpack/blend them together.
bool IsAlternating = true;
+ bool V1Zero = true, V2Zero = true;
SmallVector<int, 32> V1Mask(NumElts, -1);
SmallVector<int, 32> V2Mask(NumElts, -1);
SmallVector<int, 32> FinalMask(NumElts, -1);
@@ -11172,10 +11173,12 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
if (M >= 0 && M < NumElts) {
V1Mask[i] = M;
FinalMask[i] = i;
+ V1Zero &= Zeroable[i];
IsAlternating &= (i & 1) == 0;
} else if (M >= NumElts) {
V2Mask[i] = M - NumElts;
FinalMask[i] = i + NumElts;
+ V2Zero &= Zeroable[i];
IsAlternating &= (i & 1) == 1;
}
}
@@ -11228,7 +11231,7 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
// t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
// it is better to process t4 first to create a vector of t4[0], then unpack
// that vector with t2.
- if (!isSingleElementRepeatedMask(V1Mask) &&
+ if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
!isSingleElementRepeatedMask(V2Mask))
if (SDValue UnpackPerm =
lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
@@ -12955,7 +12958,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
@@ -13274,7 +13277,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
@@ -14065,8 +14068,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends/unpacks.
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG);
}
/// Lower 8-lane 16-bit floating point shuffles.
@@ -14444,7 +14447,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Handle multi-input cases by blending/unpacking single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
@@ -14668,6 +14671,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// results.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
@@ -14694,8 +14698,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
return true;
};
if (DoBothBroadcast())
- return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
- DAG);
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
@@ -14714,8 +14718,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
// Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
// requires that the decomposed single-input shuffles don't end up here.
- return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
- DAG);
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG);
}
// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -15907,7 +15911,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// blend the result.
if (V1IsInPlace || V2IsInPlace)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -15934,10 +15938,10 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
Subtarget, DAG);
}
@@ -16027,7 +16031,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// blend the result.
if (V1IsInPlace || V2IsInPlace)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -16051,7 +16055,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
@@ -16162,17 +16166,17 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// since after split we get a more efficient code using vpunpcklwd and
// vpunpckhwd instrs than vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
- DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
+ Subtarget, DAG);
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
Subtarget, DAG);
}
@@ -16210,8 +16214,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vpunpcklwd and vpunpckhwd instrs.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
!Subtarget.hasAVX512())
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
- DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
+ Subtarget, DAG);
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -16315,7 +16319,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
- Subtarget, DAG);
+ Zeroable, Subtarget, DAG);
}
/// Handle lowering of 16-lane 16-bit integer shuffles.
@@ -16437,7 +16441,7 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Otherwise fall back on generic lowering.
- return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
Subtarget, DAG);
}
@@ -16558,7 +16562,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Otherwise fall back on generic lowering.
- return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
Subtarget, DAG);
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index fcc50041a1d82..24c05ba9c52f6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -2775,23 +2775,13 @@ entry:
define <8 x i16> @PR104482(<16 x i8> %i) {
; SSE2-LABEL: PR104482:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,0,1,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,65535,0]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR104482: