[llvm] 34a054c - [X86] combineX86ShuffleChain - add support for combining to X86ISD::ROTLI
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 15 12:05:20 PST 2020
Author: Simon Pilgrim
Date: 2020-02-15T20:04:54Z
New Revision: 34a054ce7170060a6ec9ef4eb2dcacd69225715a
URL: https://github.com/llvm/llvm-project/commit/34a054ce7170060a6ec9ef4eb2dcacd69225715a
DIFF: https://github.com/llvm/llvm-project/commit/34a054ce7170060a6ec9ef4eb2dcacd69225715a.diff
LOG: [X86] combineX86ShuffleChain - add support for combining to X86ISD::ROTLI
Refactors matchShuffleAsBitRotate to allow use by both lowerShuffleAsBitRotate and matchUnaryPermuteShuffle.
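For context, the mask-level matcher that this refactor wraps can be sketched standalone as below. This is a simplified, illustrative rendering only: matchRotate is a hypothetical name (the in-tree helper is matchShuffleAsBitRotate and operates on an ArrayRef<int>), but the per-subgroup check follows the same idea.

#include <vector>

// Sketch: does Mask rotate every group of NumSubElts elements left by the
// same amount? E.g. {1,0,3,2,5,4,7,6} rotates each 2-element group by one
// element. Returns the rotation in elements, or -1 if there is no match.
int matchRotate(const std::vector<int> &Mask, int NumSubElts) {
  int NumElts = (int)Mask.size();
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue; // undef lane is compatible with any rotation
      if (M < i || M >= i + NumSubElts)
        return -1; // source element crosses the subgroup boundary
      // Left-rotation (in elements) implied by this lane.
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (0 <= RotateAmt && Offset != RotateAmt)
        return -1; // lanes disagree on the rotation amount
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

Multiplying the returned element count by the element width gives the X86ISD::VROTLI immediate; the refactored wrapper in the diff below tries subgroup sizes from the smallest legal one upward (at least 32-bit subgroups on AVX512, which only has vXi32/vXi64 rotates) and reports the matching vector type plus the rotation in bits.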
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3bc3c4a97825..c9b4b5967a59 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11704,17 +11704,34 @@ static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
return RotateAmt;
}
+static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
+ const X86Subtarget &Subtarget,
+ ArrayRef<int> Mask) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
+
+ // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
+ int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
+ int MaxSubElts = 64 / EltSizeInBits;
+ for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+ int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+ if (RotateAmt < 0)
+ continue;
+
+ int NumElts = Mask.size();
+ MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+ RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+ return RotateAmt * EltSizeInBits;
+ }
+
+ return -1;
+}
+
/// Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
-
- MVT SVT = VT.getScalarType();
- int EltSizeInBits = SVT.getScalarSizeInBits();
- assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
-
// Only XOP + AVX512 targets have bit rotation instructions.
// If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
bool IsLegal =
@@ -11722,44 +11739,34 @@ static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
if (!IsLegal && Subtarget.hasSSE3())
return SDValue();
- // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
- int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
- int MaxSubElts = 64 / EltSizeInBits;
- for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
- int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
- if (RotateAmt < 0)
- continue;
+ MVT RotateVT;
+ int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
+ Subtarget, Mask);
+ if (RotateAmt < 0)
+ return SDValue();
- int NumElts = VT.getVectorNumElements();
- MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
- MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
-
- // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
- // expanded to OR(SRL,SHL), will be more efficient, but if they can
- // widen to vXi16 or more then the existing lowering will be better.
- int RotateAmtInBits = RotateAmt * EltSizeInBits;
- if (!IsLegal) {
- if ((RotateAmtInBits % 16) == 0)
- return SDValue();
- // TODO: Use getTargetVShiftByConstNode.
- unsigned ShlAmt = RotateAmtInBits;
- unsigned SrlAmt = RotateSVT.getScalarSizeInBits() - RotateAmtInBits;
- V1 = DAG.getBitcast(RotateVT, V1);
- SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
- DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
- SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
- DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
- SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
- return DAG.getBitcast(VT, Rot);
- }
-
- SDValue Rot =
- DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
- DAG.getTargetConstant(RotateAmtInBits, DL, MVT::i8));
+ // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+ // expanded to OR(SRL,SHL), will be more efficient, but if they can
+ // widen to vXi16 or more then the existing lowering will be better.
+ if (!IsLegal) {
+ if ((RotateAmt % 16) == 0)
+ return SDValue();
+ // TODO: Use getTargetVShiftByConstNode.
+ unsigned ShlAmt = RotateAmt;
+ unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
+ V1 = DAG.getBitcast(RotateVT, V1);
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+ SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
return DAG.getBitcast(VT, Rot);
}
- return SDValue();
+ SDValue Rot =
+ DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, Rot);
}
/// Try to lower a vector shuffle as a byte rotation.
@@ -33538,6 +33545,19 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
+ // Attempt to match against bit rotates.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+ Subtarget.hasAVX512())) {
+ int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+ Subtarget, Mask);
+ if (0 < RotateAmt) {
+ Shuffle = X86ISD::VROTLI;
+ PermuteImm = (unsigned)RotateAmt;
+ return true;
+ }
+ }
+
return false;
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 7a8a7d326238..d922de739c9d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -464,10 +464,17 @@ define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
}
define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
-; CHECK-LABEL: combine_pshufb_not_as_pshufw:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
-; CHECK-NEXT: ret{{[l|q]}}
+; AVX2-LABEL: combine_pshufb_not_as_pshufw:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: combine_pshufb_not_as_pshufw:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vprold $16, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: ret{{[l|q]}}
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
%res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
ret <32 x i8> %res1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 16a174c0b825..32709b4fd5dc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -403,10 +403,23 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_pshufb_not_as_pshufw:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_pshufb_not_as_pshufw:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_pshufb_not_as_pshufw:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: combine_pshufb_not_as_pshufw:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vprold $16, %zmm0, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
%res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
ret <16 x i8> %res1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 5264ab101c4a..9c507ad5443e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -255,7 +255,7 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
define <16 x i8> @combine_vpperm_as_proti_v8i16(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: combine_vpperm_as_proti_v8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT: vprotw $8, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>)
ret <16 x i8> %res0
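As an illustrative cross-check (not part of the commit), feeding the VPPERM byte mask above through the matchRotate sketch from earlier in this message recovers exactly the vprotw $8 the test now expects. A small hedged driver, assuming that sketch is in scope:

#include <cstdio>
#include <vector>

// Defined in the matchRotate sketch shown after the commit message.
int matchRotate(const std::vector<int> &Mask, int NumSubElts);

int main() {
  // The mask from combine_vpperm_as_proti_v8i16 above (vXi8 elements).
  std::vector<int> Mask = {1, 0, 3,  2,  5,  4,  7,  6,
                           9, 8, 11, 10, 13, 12, 15, 14};
  const int EltBits = 8;
  // Try widening subgroups: 2 x i8 -> i16, 4 x i8 -> i32, 8 x i8 -> i64.
  for (int NumSubElts = 2; NumSubElts <= 64 / EltBits; NumSubElts *= 2) {
    int Amt = matchRotate(Mask, NumSubElts);
    if (Amt < 0)
      continue;
    std::printf("v%zui%d rotate-left by %d bits\n", Mask.size() / NumSubElts,
                EltBits * NumSubElts, Amt * EltBits);
    return 0; // prints "v8i16 rotate-left by 8 bits", i.e. vprotw $8
  }
  std::printf("no rotate match\n");
  return 1;
}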