[llvm] r288006 - [X86][SSE] Add support for combining target shuffles to 128/256-bit PSLL/PSRL bit shifts
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 27 13:08:19 PST 2016
Author: rksimon
Date: Sun Nov 27 15:08:19 2016
New Revision: 288006
URL: http://llvm.org/viewvc/llvm-project?rev=288006&view=rev
Log:
[X86][SSE] Add support for combining target shuffles to 128/256-bit PSLL/PSRL bit shifts
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=288006&r1=288005&r2=288006&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Nov 27 15:08:19 2016
@@ -25480,63 +25480,36 @@ static bool matchUnaryPermuteVectorShuff
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
- unsigned NumLanes = MaskVT.getSizeInBits() / 128;
- unsigned NumEltsPerLane = NumMaskElts / NumLanes;
bool FloatDomain = MaskVT.isFloatingPoint();
- // Attempt to match against PSLLDQ/PSRLDQ byte shifts.
- // TODO: Share common code with lowerVectorShuffleAsShift?
- //
- // PSLLDQ : (little-endian) left byte shift
- // [ zz, 0, 1, 2, 3, 4, 5, 6]
- // [ zz, zz, -1, -1, 2, 3, 4, -1]
- // [ zz, zz, zz, zz, zz, zz, -1, 1]
- // PSRLDQ : (little-endian) right byte shift
- // [ 5, 6, 7, zz, zz, zz, zz, zz]
- // [ -1, 5, 6, 7, zz, zz, zz, zz]
- // [ 1, 2, -1, -1, -1, -1, zz, zz]
+ bool ContainsZeros = false;
+ SmallBitVector Zeroable(NumMaskElts, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ Zeroable[i] = isUndefOrZero(M);
+ ContainsZeros |= (M == SM_SentinelZero);
+ }
+
+ // Attempt to match against byte/bit shifts.
+ // FIXME: Add 512-bit support.
if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- for (unsigned Shift = 1; Shift != NumEltsPerLane; ++Shift) {
- bool IsVSHLDQ = true;
- bool IsVSRLDQ = true;
-
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- unsigned Base = Lane * NumEltsPerLane;
- unsigned Ofs = NumEltsPerLane - Shift;
-
- IsVSHLDQ &= isUndefOrZeroInRange(Mask, Base, Shift);
- IsVSHLDQ &= isSequentialOrUndefInRange(Mask, Base + Shift, Ofs, Base);
-
- IsVSRLDQ &= isUndefOrZeroInRange(Mask, Base + Ofs, Shift);
- IsVSRLDQ &= isSequentialOrUndefInRange(Mask, Base, Ofs, Base + Shift);
-
- if (!IsVSHLDQ && !IsVSRLDQ)
- break;
- }
-
- if (IsVSHLDQ) {
- Shuffle = X86ISD::VSHLDQ;
- ShuffleVT = MVT::getVectorVT(MVT::i8, NumLanes * 16);
- PermuteImm = Shift * (MaskVT.getScalarSizeInBits() / 8);
- return true;
- }
- if (IsVSRLDQ) {
- Shuffle = X86ISD::VSRLDQ;
- ShuffleVT = MVT::getVectorVT(MVT::i8, NumLanes * 16);
- PermuteImm = Shift * (MaskVT.getScalarSizeInBits() / 8);
- return true;
- }
+ int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+ MaskVT.getScalarSizeInBits(), Mask,
+ 0, Zeroable, Subtarget);
+ if (0 < ShiftAmt) {
+ PermuteImm = (unsigned)ShiftAmt;
+ return true;
}
}
// Ensure we don't contain any zero elements.
- for (int M : Mask) {
- if (M == SM_SentinelZero)
- return false;
- assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
- "Expected unary shuffle");
- }
+ if (ContainsZeros)
+ return false;
+
+ assert(llvm::all_of(Mask, [&](int M) {
+ return SM_SentinelUndef <= M && M < (int)NumMaskElts;
+ }) && "Expected unary shuffle");
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll?rev=288006&r1=288005&r2=288006&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll Sun Nov 27 15:08:19 2016
@@ -514,12 +514,12 @@ define <32 x i8> @combine_pshufb_as_psrl
define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlw:
; X32: # BB#0:
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero
+; X32-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_psrlw:
; X64: # BB#0:
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero
+; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
ret <32 x i8> %res0
@@ -528,12 +528,12 @@ define <32 x i8> @combine_pshufb_as_psrl
define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslld:
; X32: # BB#0:
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28]
+; X32-NEXT: vpslld $24, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pslld:
; X64: # BB#0:
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28]
+; X64-NEXT: vpslld $24, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
ret <32 x i8> %res0
@@ -542,12 +542,12 @@ define <32 x i8> @combine_pshufb_as_psll
define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlq:
; X32: # BB#0:
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[5,6,7],zero,zero,zero,zero,zero,ymm0[13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23],zero,zero,zero,zero,zero,ymm0[29,30,31],zero,zero,zero,zero,zero
+; X32-NEXT: vpsrlq $40, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_psrlq:
; X64: # BB#0:
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[5,6,7],zero,zero,zero,zero,zero,ymm0[13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23],zero,zero,zero,zero,zero,ymm0[29,30,31],zero,zero,zero,zero,zero
+; X64-NEXT: vpsrlq $40, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
ret <32 x i8> %res0
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll?rev=288006&r1=288005&r2=288006&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll Sun Nov 27 15:08:19 2016
@@ -299,12 +299,12 @@ define <16 x i8> @combine_pshufb_as_psrl
define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlw:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[3],zero,xmm0[5],zero,xmm0[7],zero,xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero
+; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlw:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[3],zero,xmm0[5],zero,xmm0[7],zero,xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128>)
ret <16 x i8> %res0
@@ -313,12 +313,12 @@ define <16 x i8> @combine_pshufb_as_psrl
define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslld:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12]
+; SSE-NEXT: pslld $24, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pslld:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12]
+; AVX-NEXT: vpslld $24, %xmm0, %xmm0
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12>)
ret <16 x i8> %res0
@@ -327,12 +327,12 @@ define <16 x i8> @combine_pshufb_as_psll
define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlq:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,6,7],zero,zero,zero,zero,zero,xmm0[13,14,15],zero,zero,zero,zero,zero
+; SSE-NEXT: psrlq $40, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlq:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,6,7],zero,zero,zero,zero,zero,xmm0[13,14,15],zero,zero,zero,zero,zero
+; AVX-NEXT: vpsrlq $40, %xmm0, %xmm0
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128>)
ret <16 x i8> %res0
More information about the llvm-commits mailing list