[llvm] c769ba9 - [X86][AVX] combineHorizOpWithShuffle - improve SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))) folding
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 26 10:24:24 PDT 2021
Author: Simon Pilgrim
Date: 2021-03-26T17:23:54Z
New Revision: c769ba9514c3f82578513b730eda8d49ce257e23
URL: https://github.com/llvm/llvm-project/commit/c769ba9514c3f82578513b730eda8d49ce257e23
DIFF: https://github.com/llvm/llvm-project/commit/c769ba9514c3f82578513b730eda8d49ce257e23.diff
LOG: [X86][AVX] combineHorizOpWithShuffle - improve SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))) folding
Peek through bitcasts to find subvector splits, and use getTargetShuffleInputs to decode target shuffles as well as ShuffleVectorSDNode nodes.
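To keep the HOP LHS/RHS coherent, the combine still requires that the decoded
unary shuffle mask can be scaled to 4 (64-bit sized) elements. A rough
standalone model of that widening check (widenMaskTo4 is an illustrative name,
not an LLVM API; the real scaleShuffleElements helper also handles undef/zero
sentinels and other scale factors):

#include <cstdio>
#include <vector>

// Check that Mask (size 4*Scale) can be rewritten as a 4-element mask of
// wider elements: each group of Scale entries must reference Scale
// consecutive narrow elements starting on a wide-element boundary.
static bool widenMaskTo4(const std::vector<int> &Mask,
                         std::vector<int> &Scaled) {
  if (Mask.empty() || Mask.size() % 4 != 0)
    return false;
  int Scale = (int)Mask.size() / 4;
  Scaled.clear();
  for (int I = 0; I != 4; ++I) {
    int First = Mask[I * Scale];
    if (First % Scale != 0)
      return false; // group doesn't start on a wide-element boundary
    for (int J = 1; J != Scale; ++J)
      if (Mask[I * Scale + J] != First + J)
        return false; // group isn't a run of consecutive narrow elements
    Scaled.push_back(First / Scale);
  }
  return true;
}

int main() {
  // The v8i32 mask <0,1,4,5,2,3,6,7> widens to the v4i64 mask <0,2,1,3> -
  // the same [0,2,1,3] pattern that shows up as vpshufd in the tests below.
  std::vector<int> Scaled;
  if (widenMaskTo4({0, 1, 4, 5, 2, 3, 6, 7}, Scaled))
    for (int M : Scaled)
      printf("%d ", M); // prints: 0 2 1 3
  printf("\n");
}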
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-ssat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c8096f7e1a608..74322f68912da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43213,35 +43213,41 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
EVT SrcVT = N0.getValueType();
+ SDValue BC0 = peekThroughBitcasts(N0);
+ SDValue BC1 = peekThroughBitcasts(N1);
+
// Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
// to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
// truncation trees that help us avoid lane crossing shuffles.
// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
// TODO: We don't handle vXf64 shuffles yet.
- if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- N0.getConstantOperandAPInt(1) == 0 &&
- N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
- N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
- N0.getOperand(0).getValueType().is256BitVector() &&
- SrcVT.getScalarSizeInBits() <= 32) {
- // TODO - support target/faux shuffles.
- SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
- if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+ if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
+ BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ BC0.getOperand(0) == BC1.getOperand(0) &&
+ BC0.getOperand(0).getValueType().is256BitVector() &&
+ BC0.getConstantOperandAPInt(1) == 0 &&
+ BC1.getConstantOperandAPInt(1) ==
+ BC0.getValueType().getVectorNumElements()) {
+ SmallVector<SDValue> ShuffleOps;
+ SmallVector<int> ShuffleMask, ScaledMask;
+ SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
+ if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
+ resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
// To keep the HOP LHS/RHS coherency, we must be able to scale the unary
- // shuffle to a vXi64 width - we can probably relax this in the future.
- SmallVector<int, 4> ShuffleMask;
- if (SVN->getOperand(1).isUndef() &&
- scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+ // shuffle to a v4X64 width - we can probably relax this in the future.
+ if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
+ ShuffleOps[0].getValueType().is256BitVector() &&
+ scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
SDLoc DL(N);
SDValue Lo, Hi;
MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
- std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
- Lo = DAG.getBitcast(N0.getValueType(), Lo);
- Hi = DAG.getBitcast(N1.getValueType(), Hi);
+ std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
+ Lo = DAG.getBitcast(SrcVT, Lo);
+ Hi = DAG.getBitcast(SrcVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
return DAG.getBitcast(VT, Res);
}
}
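In effect the fold commutes a coherent unary shuffle past the horizontal op.
A minimal sketch of the identity for the HADD case, assuming a v8i32 source
whose 8-element mask scales to the 4-element mask <0,2,1,3> (hadd below
models PHADDD; all names are mine):

#include <cassert>
#include <vector>

using V4 = std::vector<int>;

// PHADDD-style horizontal add of two 4 x i32 halves.
static V4 hadd(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  // X is a 256-bit v8i32; Mask8 is a unary shuffle of X that scales to the
  // 64-bit-element mask <0,2,1,3> (pairs move together, so HOP lane
  // pairings stay coherent).
  std::vector<int> X = {10, 20, 30, 40, 50, 60, 70, 80};
  int Mask8[8] = {0, 1, 4, 5, 2, 3, 6, 7};
  int Scaled[4] = {0, 2, 1, 3};

  // HOP(LOSUBVECTOR(SHUFFLE(X)), HISUBVECTOR(SHUFFLE(X))) ...
  std::vector<int> S(8);
  for (int I = 0; I != 8; ++I) S[I] = X[Mask8[I]];
  V4 Before = hadd({S.begin(), S.begin() + 4}, {S.begin() + 4, S.end()});

  // ... equals SHUFFLE'(HOP(LOSUBVECTOR(X), HISUBVECTOR(X))) with the
  // scaled mask applied to the 128-bit HOP result.
  V4 H = hadd({X.begin(), X.begin() + 4}, {X.begin() + 4, X.end()});
  V4 After = {H[Scaled[0]], H[Scaled[1]], H[Scaled[2]], H[Scaled[3]]};

  assert(Before == After); // both are {30, 110, 70, 150}
}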
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index f561add083b2a..53873481a30e5 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -846,9 +846,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask)
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
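This AVX2 change, repeated in the remaining test files, makes the same
substitution throughout: the lane-crossing vpermq on the 256-bit pack result
becomes an in-lane vpshufd on the 128-bit result of the second pack. A
word-granularity sketch of the v8i64 -> v8i8 truncation, checking that both
orderings feed the final vpackuswb the same words (helper names are mine;
PACKUSDW saturation is omitted because the payload values stay in range):

#include <cassert>
#include <cstdio>
#include <vector>

using V = std::vector<int>; // a register as 16-bit word values, low to high

// PACKUSDW modelled at word granularity: every input dword here is
// (payload, 0), so dword->word packing just keeps the even-indexed words.
// The real instruction also saturates, which these payloads never trigger.
static V packusdw(const V &A, const V &B) {
  V R;
  for (unsigned L = 0; L != A.size() / 8; ++L) { // 8 words per 128-bit lane
    for (unsigned I = 0; I != 8; I += 2) R.push_back(A[L * 8 + I]);
    for (unsigned I = 0; I != 8; I += 2) R.push_back(B[L * 8 + I]);
  }
  return R;
}

// Immediate-style shuffle of 4 groups of GroupWords words: vpermq on a ymm
// is GroupWords = 4, vpshufd on an xmm is GroupWords = 2.
static V shuffle4(const V &A, int GroupWords, const int M[4]) {
  V R;
  for (int I = 0; I != 4; ++I)
    for (int J = 0; J != GroupWords; ++J)
      R.push_back(A[M[I] * GroupWords + J]);
  return R;
}

static V lo(const V &A) { return {A.begin(), A.begin() + A.size() / 2}; }
static V hi(const V &A) { return {A.begin() + A.size() / 2, A.end()}; }

int main() {
  // <8 x i64> %x after the AND with 255, viewed as words: element x[i]
  // occupies one payload word plus three zero words. Payloads are 1..8.
  V Y0, Y1;
  for (int I = 0; I != 4; ++I) Y0.insert(Y0.end(), {I + 1, 0, 0, 0});
  for (int I = 4; I != 8; ++I) Y1.insert(Y1.end(), {I + 1, 0, 0, 0});
  const int M[4] = {0, 2, 1, 3};

  // Old: vpackusdw ymm; vpermq [0,2,1,3]; vextracti128; vpackusdw xmm.
  V T = shuffle4(packusdw(Y0, Y1), 4, M);
  V Old = packusdw(lo(T), hi(T));

  // New: vpackusdw ymm; vextracti128; vpackusdw xmm; vpshufd [0,2,1,3].
  V U = packusdw(Y0, Y1);
  V New = shuffle4(packusdw(lo(U), hi(U)), 2, M);

  assert(Old == New); // identical words feed the final vpackuswb either way
  for (int W : New) printf("%d ", W); // prints: 1 2 3 4 5 6 7 8
  printf("\n");
}

Sinking the [0,2,1,3] fixup below the width-reducing packs turns a
lane-crossing ymm permute (vpermq) into an in-lane xmm shuffle (vpshufd),
which is typically lower latency on Intel cores and touches half as much
data.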
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 55df7b78745e7..9bf23917b375f 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -1333,9 +1333,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask)
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index bbe832d93d916..21ad6259a4633 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -1137,9 +1137,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask)
; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 30867e21914bf..a3146c58eac18 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -3871,9 +3871,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64>* %p0) "min-legal-vector-width
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -4274,9 +4274,9 @@ define void @trunc_packus_v8i64_v8i8_store(<8 x i64>* %p0, <8 x i8> *%p1) "min-l
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 442d903fe039a..34f05c161a082 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -3639,9 +3639,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64>* %p0) "min-legal-vector-width"=
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -4060,9 +4060,9 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64>* %p0, <8 x i8> *%p1) "min-leg
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 54d29c644a425..720e8185b1ea7 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -2812,9 +2812,9 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64>* %p0) {
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -3073,9 +3073,9 @@ define void @trunc_usat_v8i64_v8i8_store(<8 x i64>* %p0, <8 x i8> *%p1) {
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 77a81e8f0161a..c2b1bf1f04c93 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -273,9 +273,9 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper