[llvm] 72e242a - [X86][AVX] canonicalizeShuffleMaskWithHorizOp - improve support for 256/512-bit vectors
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed May 12 04:13:35 PDT 2021
Author: Simon Pilgrim
Date: 2021-05-12T12:13:24+01:00
New Revision: 72e242a286be1c821c521fdc8a778517b193a59e
URL: https://github.com/llvm/llvm-project/commit/72e242a286be1c821c521fdc8a778517b193a59e
DIFF: https://github.com/llvm/llvm-project/commit/72e242a286be1c821c521fdc8a778517b193a59e.diff
LOG: [X86][AVX] canonicalizeShuffleMaskWithHorizOp - improve support for 256/512-bit vectors
Extend the HOP(HOP(X,Y),HOP(Z,W)) and SHUFFLE(HOP(X,Y),HOP(Z,W)) folds to handle repeating 256/512-bit vector cases.
This allows us to drop the UNPACK(HOP(),HOP()) custom fold in combineTargetShuffle.
This required isRepeatedTargetShuffleMask to be tweaked to support target shuffle masks taking more than 2 inputs.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/horizontal-shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ddc49d6a2ab2a..463d9d12461d0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10964,10 +10964,10 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
- // Ok, handle the in-lane shuffles by detecting if and when they repeat.
- // Adjust second vector indices to start at LaneSize instead of Size.
- int LocalM =
- Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+ // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
+ // later vector indices to start at multiples of LaneSize instead of Size.
+ int LaneM = Mask[i] / Size;
+ int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
@@ -36225,24 +36225,25 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
int NumEltsPerLane = NumElts / NumLanes;
int NumHalfEltsPerLane = NumEltsPerLane / 2;
MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ unsigned EltSizeInBits = RootSizeInBits / Mask.size();
- // TODO: Add support for 256/512-bit vectors.
- if (RootSizeInBits == 128 && NumEltsPerLane >= 4 &&
+ if (NumEltsPerLane >= 4 &&
(isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
- SmallVector<int> ScaledMask;
- if (scaleShuffleElements(Mask, 4, ScaledMask)) {
+ SmallVector<int> LaneMask, ScaledMask;
+ if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
+ scaleShuffleElements(LaneMask, 4, ScaledMask)) {
// See if we can remove the shuffle by resorting the HOP chain so that
// the HOP args are pre-shuffled.
// TODO: Generalize to any sized/depth chain.
// TODO: Add support for PACKSS/PACKUS.
- if (isHoriz && NumEltsPerLane == 4) {
+ if (isHoriz) {
// Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
auto GetHOpSrc = [&](int M) {
if (M == SM_SentinelUndef)
return DAG.getUNDEF(VT0);
if (M == SM_SentinelZero)
return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
- SDValue Src0 = BC[M / NumElts];
+ SDValue Src0 = BC[M / 4];
SDValue Src1 = Src0.getOperand((M % 4) >= 2);
if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
return Src1.getOperand(M % 2);
@@ -36253,8 +36254,8 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
SDValue M2 = GetHOpSrc(ScaledMask[2]);
SDValue M3 = GetHOpSrc(ScaledMask[3]);
if (M0 && M1 && M2 && M3) {
- SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
- SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
+ SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
+ SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
}
}
@@ -36348,7 +36349,6 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
// represents the LHS/RHS inputs for the lower/upper halves.
- unsigned EltSizeInBits = RootSizeInBits / Mask.size();
SmallVector<int, 16> TargetMask128, WideMask128;
if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
scaleShuffleElements(TargetMask128, 2, WideMask128)) {
@@ -37564,29 +37564,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
- case X86ISD::UNPCKL:
- case X86ISD::UNPCKH: {
- // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
- // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
- // Don't fold if hop(x,y) == hop(z,w).
- // TODO: Merge this into canonicalizeShuffleMaskWithHorizOp?
- SDValue N0 = N.getOperand(0);
- SDValue N1 = N.getOperand(1);
- if (VT.getScalarSizeInBits() == 32 && N0 != N1 &&
- N0.getOpcode() == N1.getOpcode() && isHorizOp(N0.getOpcode())) {
- unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
- SDValue Res = DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(LoHi),
- N1.getOperand(LoHi));
- // Use SHUFPS for the permute so this will work on SSE3 targets, shuffle
- // combining and domain handling will simplify this later on.
- EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
- Res = DAG.getBitcast(ShuffleVT, Res);
- Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
- getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
- return DAG.getBitcast(VT, Res);
- }
- return SDValue();
- }
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle.ll b/llvm/test/CodeGen/X86/horizontal-shuffle.ll
index ba9761d4e9883..9879fa6899ae1 100644
--- a/llvm/test/CodeGen/X86/horizontal-shuffle.ll
+++ b/llvm/test/CodeGen/X86/horizontal-shuffle.ll
@@ -305,9 +305,8 @@ define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x
define <8 x float> @test_shufps_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_shufps_packss_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vpackssdw %ymm3, %ymm0, %ymm1
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,2],ymm0[4,5],ymm1[6,6]
+; CHECK-NEXT: vpackssdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
%2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
@@ -320,9 +319,8 @@ define <8 x float> @test_shufps_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
define <8 x float> @test_shufps_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_shufps_packus_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpackuswb %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vpackuswb %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4]
+; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
%2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)
More information about the llvm-commits mailing list