[llvm] r344446 - [X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles (PR39161)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 13 04:38:10 PDT 2018
Author: rksimon
Date: Sat Oct 13 04:38:10 2018
New Revision: 344446
URL: http://llvm.org/viewvc/llvm-project?rev=344446&view=rev
Log:
[X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles (PR39161)
Add shuffle lowering for the case where we can shuffle the lanes into place followed by an in-lane permute.
This is mainly for cases where we can have non-repeating permutes in each lane, but for now I've only enabled it for v4f64 unary shuffles to fix PR39161 - there is no test coverage yet for other shuffles that might benefit.
We now have several cross-lane shuffle lowering methods that all do something similar - I've looked at merging some of these (notably by making the repeated mask mechanism in lowerVectorShuffleByMerging128BitLanes optional), but there are a lot of assertions/assumptions in the way that make this tricky, so I ended up adding yet another relatively simple method instead.
Differential Revision: https://reviews.llvm.org/D53148
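For anyone following along, here is a small standalone C++ sketch (not part of the patch) of the mask decomposition that lowerVectorShuffleAsLanePermuteAndPermute performs, worked through on the unary <2,2,3,3> mask from the shuffle_v4f64_2233 test below. The lane counts are hard-coded for v4f64 (2 x 128-bit lanes of 2 elements); it only mirrors the mask bookkeeping, not the DAG node creation.

#include <array>
#include <cstdio>

int main() {
  constexpr int NumElts = 4, NumLanes = 2, NumEltsPerLane = 2;
  // Unary shuffle mask from the shuffle_v4f64_2233 test case.
  std::array<int, NumElts> Mask = {2, 2, 3, 3};

  std::array<int, NumLanes> SrcLane = {-1, -1};     // source lane per dest lane
  std::array<int, NumElts> LaneMask{}, PermMask{};

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    int Src = M / NumEltsPerLane;  // 128-bit lane the element comes from
    int Dst = i / NumEltsPerLane;  // 128-bit lane it must end up in
    if (SrcLane[Dst] >= 0 && SrcLane[Dst] != Src)
      return 1;                    // dest lane needs two source lanes: give up
    SrcLane[Dst] = Src;
    // First shuffle: move whole 128-bit source lanes into position.
    LaneMask[i] = Src * NumEltsPerLane + (i % NumEltsPerLane);
    // Second shuffle: permute elements within each (now correct) lane.
    PermMask[i] = Dst * NumEltsPerLane + (M % NumEltsPerLane);
  }

  // For <2,2,3,3> this prints LaneMask = 2 3 2 3 (the vperm2f128) and
  // PermMask = 0 0 3 3 (the in-lane vpermilpd), matching the updated
  // AVX1 codegen in vector-shuffle-256-v4.ll.
  std::printf("LaneMask: %d %d %d %d\n", LaneMask[0], LaneMask[1], LaneMask[2], LaneMask[3]);
  std::printf("PermMask: %d %d %d %d\n", PermMask[0], PermMask[1], PermMask[2], PermMask[3]);
  return 0;
}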
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=344446&r1=344445&r2=344446&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Oct 13 04:38:10 2018
@@ -13431,6 +13431,60 @@ static SDValue lowerVectorShuffleAsSplit
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a lane permutation followed by a per-lane permutation.
+///
+/// This is mainly for cases where we can have non-repeating permutes
+/// in each lane.
+///
+/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// we should investigate merging them.
+static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+
+ SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
+ SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+ SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+
+ // Ensure that each lane comes from a single source lane.
+ int SrcLane = M / NumEltsPerLane;
+ int DstLane = i / NumEltsPerLane;
+ if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
+ return SDValue();
+ SrcLaneMask[DstLane] = SrcLane;
+
+ LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
+ PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
+ }
+
+ // If we're only shuffling a single lowest lane and the rest are identity
+ // then don't bother.
+ // TODO - isShuffleMaskInputInPlace could be extended to something like this.
+ int NumIdentityLanes = 0;
+ bool OnlyShuffleLowestLane = true;
+ for (int i = 0; i != NumLanes; ++i) {
+ if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
+ i * NumEltsPerLane))
+ NumIdentityLanes++;
+ else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
+ OnlyShuffleLowestLane = false;
+ }
+ if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+ return SDValue();
+
+ SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
+ return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
+}
+
+/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
@@ -14166,6 +14220,11 @@ static SDValue lowerV4F64VectorShuffle(c
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
+ // Try to permute the lanes and then use a per-lane permute.
+ if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+ return V;
+
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
DAG, Subtarget);
@@ -14200,6 +14259,7 @@ static SDValue lowerV4F64VectorShuffle(c
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;
+
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll?rev=344446&r1=344445&r2=344446&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll Sat Oct 13 04:38:10 2018
@@ -91,9 +91,8 @@ define <4 x double> @shuffle_v4f64_0300(
define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_1000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_1000:
@@ -174,10 +173,8 @@ define <4 x double> @shuffle_v4f64_2222_
define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2233:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_2233:
@@ -766,9 +763,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4
define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1000: