[llvm] r324709 - [X86] Teach shuffle lowering to recognize 128/256 bit insertions into a zero vector.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 8 21:54:34 PST 2018
Author: ctopper
Date: Thu Feb 8 21:54:34 2018
New Revision: 324709
URL: http://llvm.org/viewvc/llvm-project?rev=324709&view=rev
Log:
[X86] Teach shuffle lowering to recognize 128/256 bit insertions into a zero vector.
This regresses a couple of cases in the shuffle combining test. But those cases use intrinsics that InstCombine knows how to turn into a generic shuffle earlier. This should give opportunities to fold these patterns earlier in InstCombine or DAG combine.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll
llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=324709&r1=324708&r2=324709&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Feb 8 21:54:34 2018
@@ -12727,6 +12727,19 @@ static SDValue lowerV2X128VectorShuffle(
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ bool IsLowZero = (Zeroable & 0x3) == 0x3;
+ bool IsHighZero = (Zeroable & 0xc) == 0xc;
+
+ // Try to use an insert into a zero vector.
+ if (WidenedMask[0] == 0 && IsHighZero) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
// TODO: If minimizing size and one of the inputs is a zero vector and the
// the zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
@@ -12736,9 +12749,6 @@ static SDValue lowerV2X128VectorShuffle(
Zeroable, Subtarget, DAG))
return Blend;
- bool IsLowZero = (Zeroable & 0x3) == 0x3;
- bool IsHighZero = (Zeroable & 0xc) == 0xc;
-
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
@@ -12750,8 +12760,7 @@ static SDValue lowerV2X128VectorShuffle(
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
@@ -13886,8 +13895,11 @@ static SDValue lower256BitVectorShuffle(
/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+ ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
@@ -13899,6 +13911,18 @@ static SDValue lowerV4X128VectorShuffle(
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ // Try to use an insert into a zero vector.
+ if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+ (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+ unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
@@ -14004,7 +14028,8 @@ static SDValue lowerV8F64VectorShuffle(c
}
if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
+ Subtarget, DAG))
return Shuf128;
if (SDValue Unpck =
@@ -14114,7 +14139,8 @@ static SDValue lowerV8I64VectorShuffle(c
}
if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
+ V1, V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
Modified: llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll?rev=324709&r1=324708&r2=324709&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll Thu Feb 8 21:54:34 2018
@@ -435,8 +435,7 @@ define <4 x double> @shuffle_v4f64_zz67_
define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
@@ -444,8 +443,7 @@ define <4 x double> @shuffle_v4f64_01zz(
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
@@ -471,8 +469,7 @@ define <4 x double> @shuffle_v4f64_23zz_
define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
@@ -480,8 +477,7 @@ define <4 x double> @shuffle_v4f64_45zz(
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=324709&r1=324708&r2=324709&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Thu Feb 8 21:54:34 2018
@@ -2759,8 +2759,7 @@ define <8 x i64> @mask_widening(<2 x i64
; KNL-NEXT: kshiftlw $12, %k0, %k0
; KNL-NEXT: kshiftrw $12, %k0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; KNL-NEXT: vmovdqa %ymm0, %ymm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; KNL-NEXT: retq
@@ -2769,8 +2768,7 @@ define <8 x i64> @mask_widening(<2 x i64
; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %zmm0
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; SKX-NEXT: vmovdqa %ymm0, %ymm0
; SKX-NEXT: vpmovd2m %zmm0, %k1
; SKX-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -2783,8 +2781,7 @@ define <8 x i64> @mask_widening(<2 x i64
; AVX512BW-NEXT: kshiftlw $12, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; AVX512BW-NEXT: retq
@@ -2797,8 +2794,7 @@ define <8 x i64> @mask_widening(<2 x i64
; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0
; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll?rev=324709&r1=324708&r2=324709&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll Thu Feb 8 21:54:34 2018
@@ -2783,14 +2783,12 @@ define <8 x i64> @test_v8i64_insert_zero
define <8 x i64> @test_v8i64_insert_zero_256(<8 x i64> %a) {
; AVX512F-LABEL: test_v8i64_insert_zero_256:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovaps %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_v8i64_insert_zero_256:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; AVX512F-32-NEXT: vmovaps %ymm0, %ymm0
; AVX512F-32-NEXT: retl
%res = shufflevector <8 x i64> %a, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
ret <8 x i64> %res
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll?rev=324709&r1=324708&r2=324709&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll Thu Feb 8 21:54:34 2018
@@ -203,14 +203,16 @@ define <8 x float> @combine_vpermilvar_v
define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
; X32-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
; X32: # %bb.0:
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X32-NEXT: vmovapd %xmm0, %xmm0
+; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X32-NEXT: retl
;
; X64-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
; X64: # %bb.0:
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X64-NEXT: vmovapd %xmm0, %xmm0
+; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-NEXT: retq
%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
%2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
More information about the llvm-commits
mailing list