[llvm] r278787 - [X86][SSE] Add support for combining target shuffles to PALIGNR byte rotations
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 16 03:03:23 PDT 2016
Author: rksimon
Date: Tue Aug 16 05:03:23 2016
New Revision: 278787
URL: http://llvm.org/viewvc/llvm-project?rev=278787&view=rev
Log:
[X86][SSE] Add support for combining target shuffles to PALIGNR byte rotations
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=278787&r1=278786&r2=278787&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Aug 16 05:03:23 2016
@@ -7747,13 +7747,8 @@ static SDValue lowerVectorShuffleAsDecom
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
-
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
@@ -7769,20 +7764,28 @@ static SDValue lowerVectorShuffleAsByteR
SDValue Lo, Hi;
for (int l = 0; l < NumElts; l += NumLaneElts) {
for (int i = 0; i < NumLaneElts; ++i) {
- if (Mask[l + i] < 0)
+ int M = Mask[l + i];
+
+ if (M == SM_SentinelUndef)
continue;
+ if (M == SM_SentinelZero)
+ return -1;
+
+ assert(0 <= M && M < (2*NumElts) && "Unexpected mask index.");
+
// Get the mod-Size index and lane correct it.
- int LaneIdx = (Mask[l + i] % NumElts) - l;
+ int LaneIdx = (M % NumElts) - l;
+
// Make sure it was in this lane.
if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
- return SDValue();
+ return -1;
// Determine where a rotated vector would have started.
int StartIdx = i - LaneIdx;
if (StartIdx == 0)
// The identity rotation isn't interesting, stop.
- return SDValue();
+ return -1;
// If we found the tail of a vector the rotation must be the missing
// front. If we found the head of a vector, it must be how much of the
@@ -7793,10 +7796,10 @@ static SDValue lowerVectorShuffleAsByteR
Rotation = CandidateRotation;
else if (Rotation != CandidateRotation)
// The rotations don't match, so we can't match this mask.
- return SDValue();
+ return -1;
// Compute which value this mask is pointing at.
- SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
+ SDValue MaskV = M < NumElts ? V1 : V2;
// Compute which of the two target values this index should be assigned
// to. This reflects whether the high elements are remaining or the low
@@ -7810,7 +7813,7 @@ static SDValue lowerVectorShuffleAsByteR
else if (TargetV != MaskV)
// This may be a rotation, but it pulls from the inputs in some
// unsupported interleaving.
- return SDValue();
+ return -1;
}
}
@@ -7822,15 +7825,32 @@ static SDValue lowerVectorShuffleAsByteR
else if (!Hi)
Hi = Lo;
- // Cast the inputs to i8 vector of correct length to match PALIGNR or
- // PSLLDQ/PSRLDQ.
- MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
- Lo = DAG.getBitcast(ByteVT, Lo);
- Hi = DAG.getBitcast(ByteVT, Hi);
+ V1 = Lo;
+ V2 = Hi;
// The actual rotate instruction rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int Scale = 16 / NumLaneElts;
+ return Rotation * Scale;
+}
+
+static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+ SDValue Lo = V1, Hi = V2;
+ int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+ if (ByteRotation <= 0)
+ return SDValue();
+
+ // Cast the inputs to i8 vector of correct length to match PALIGNR or
+ // PSLLDQ/PSRLDQ.
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ Lo = DAG.getBitcast(ByteVT, Lo);
+ Hi = DAG.getBitcast(ByteVT, Hi);
// SSSE3 targets can use the palignr instruction.
if (Subtarget.hasSSSE3()) {
@@ -7838,7 +7858,7 @@ static SDValue lowerVectorShuffleAsByteR
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
- DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
+ DAG.getConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
@@ -7849,8 +7869,8 @@ static SDValue lowerVectorShuffleAsByteR
"SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
- int LoByteShift = 16 - Rotation * Scale;
- int HiByteShift = Rotation * Scale;
+ int LoByteShift = 16 - ByteRotation;
+ int HiByteShift = ByteRotation;
SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getConstant(LoByteShift, DL, MVT::i8));
@@ -25198,6 +25218,19 @@ static bool matchBinaryPermuteVectorShuf
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ bool FloatDomain = MaskVT.isFloatingPoint();
+
+ // Attempt to match against PALIGNR byte rotate.
+ if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+ if (0 < ByteRotation) {
+ Shuffle = X86ISD::PALIGNR;
+ ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
+ PermuteImm = ByteRotation;
+ return true;
+ }
+ }
// Attempt to blend with zero.
if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll?rev=278787&r1=278786&r2=278787&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll Tue Aug 16 05:03:23 2016
@@ -2971,9 +2971,7 @@ define <16 x i16> @shuffle_v16i16_21_22_
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3215,9 +3213,7 @@ define <16 x i16> @shuffle_v16i16_05_06_
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll?rev=278787&r1=278786&r2=278787&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll Tue Aug 16 05:03:23 2016
@@ -576,7 +576,7 @@ define <4 x i64> @shuffle_v4i64_0112(<4
; AVX1-LABEL: shuffle_v4i64_0112:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -748,7 +748,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4
; AVX1-LABEL: shuffle_v4i64_0412:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
@@ -775,7 +775,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4
; AVX1-LABEL: shuffle_v4i64_4012:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll?rev=278787&r1=278786&r2=278787&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll Tue Aug 16 05:03:23 2016
@@ -184,6 +184,20 @@ define <16 x i8> @combine_pshufb_psrldq(
ret <16 x i8> %2
}
+define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_palignr:
+; SSE: # BB#0:
+; SSE-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_palignr:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 undef, i8 undef, i8 0>)
+ ret <16 x i8> %res0
+}
+
define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
; SSE: # BB#0:
More information about the llvm-commits
mailing list