[llvm] r359292 - [X86][AVX] Combine shuffles extracted from a common vector
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 26 02:56:14 PDT 2019
Author: rksimon
Date: Fri Apr 26 02:56:14 2019
New Revision: 359292
URL: http://llvm.org/viewvc/llvm-project?rev=359292&view=rev
Log:
[X86][AVX] Combine shuffles extracted from a common vector
A small step towards combining shuffles across vector sizes - this recognizes when a shuffle's operands are all extracted from the same larger source and tries to combine them into a unary shuffle of that source instead. Fixes one of the test cases from PR34380.
Differential Revision: https://reviews.llvm.org/D60512
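For example, the first test case below is a v8i16 shuffle of a v16i16 source:

define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
  ret <8 x i16> %res
}

During lowering, both shuffle operands are 128-bit extracts of the same ymm register, so the two-VPSHUFB-plus-VPBLENDW sequence previously emitted can be folded into a single VPERMW of the whole source (plus a broadcast load of the index vector), as the test diffs below show.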
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=359292&r1=359291&r2=359292&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Apr 26 02:56:14 2019
@@ -31772,6 +31772,51 @@ static SDValue combineX86ShuffleChain(Ar
    return DAG.getBitcast(RootVT, Res);
  }
+  // If that failed and both inputs are extracted from the same source then
+  // try to combine as a unary shuffle with the larger type.
+  if (!UnaryShuffle && V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      V2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      isa<ConstantSDNode>(V1.getOperand(1)) &&
+      isa<ConstantSDNode>(V2.getOperand(1))) {
+    SDValue Src1 = V1.getOperand(0);
+    SDValue Src2 = V2.getOperand(0);
+    if (Src1 == Src2) {
+      unsigned Offset1 = V1.getConstantOperandVal(1);
+      unsigned Offset2 = V2.getConstantOperandVal(1);
+      assert(((Offset1 % VT1.getVectorNumElements()) == 0 &&
+              (Offset2 % VT2.getVectorNumElements()) == 0 &&
+              (Src1.getValueSizeInBits() % RootSizeInBits) == 0) &&
+             "Unexpected subvector extraction");
+      // Convert extraction indices to mask size.
+      Offset1 /= VT1.getVectorNumElements();
+      Offset2 /= VT2.getVectorNumElements();
+      Offset1 *= NumMaskElts;
+      Offset2 *= NumMaskElts;
+
+      // Create new mask for larger type.
+      SmallVector<int, 64> NewMask(Mask);
+      for (int &M : NewMask) {
+        if (M < 0)
+          continue;
+        if (M < (int)NumMaskElts)
+          M += Offset1;
+        else
+          M = (M - NumMaskElts) + Offset2;
+      }
+      unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
+      NewMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+
+      SDValue NewInputs[] = {Src1};
+      if (SDValue Res = combineX86ShuffleChain(
+              NewInputs, Src1, NewMask, Depth, HasVariableMask,
+              AllowVariableMask, DAG, Subtarget)) {
+        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT1, Res,
+                          DAG.getIntPtrConstant(0, DL));
+        return DAG.getBitcast(RootVT, Res);
+      }
+    }
+  }
+
  // Failed to find any combines.
  return SDValue();
}
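To make the mask arithmetic concrete, here is a minimal standalone sketch (plain C++, independent of the LLVM code above; the variable values are taken from the first v8i16 test case below, with the extraction offsets pre-converted to mask units). V1 and V2 are the low and high halves of a v16i16 source, so Offset1 = 0 and Offset2 = 8, and the defined low half of the widened mask comes out as the VPERMW index vector seen in the new codegen:

#include <cstdio>
#include <vector>

int main() {
  const int NumMaskElts = 8;          // lanes in the root shuffle mask
  const int Offset1 = 0, Offset2 = 8; // extraction offsets, in mask units
  const int Scale = 2;                // source bits / root bits (256 / 128)

  // Two-input mask: values 0..7 select from V1, 8..15 select from V2.
  std::vector<int> Mask = {8, 6, 12, 4, 7, 9, 14, 8};

  // Rebase every lane onto the common source vector.
  for (int &M : Mask) {
    if (M < 0)
      continue; // undef lane, leave as-is
    if (M < NumMaskElts)
      M += Offset1;
    else
      M = (M - NumMaskElts) + Offset2;
  }

  // Pad to the source width with undef sentinels (-1).
  Mask.insert(Mask.end(), (Scale - 1) * NumMaskElts, -1);

  for (int M : Mask)
    printf("%d ", M); // 8 6 12 4 7 9 14 8, then eight -1s
  printf("\n");
  return 0;
}

The upper eight lanes stay undef, which is what lets the backend materialize the index vector with a cheap 128-bit broadcast (the "# ymm1 = mem[0,1,0,1]" lines below).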
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=359292&r1=359291&r2=359292&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Fri Apr 26 02:56:14 2019
@@ -6,10 +6,10 @@
define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
@@ -18,10 +18,9 @@ define <8 x i16> @test_16xi16_to_8xi16_p
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
@@ -35,10 +34,9 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
@@ -51,10 +49,9 @@ define <8 x i16> @test_masked_z_16xi16_t
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
@@ -68,10 +65,9 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
@@ -84,10 +80,9 @@ define <8 x i16> @test_masked_z_16xi16_t
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
@@ -101,10 +96,9 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
@@ -117,10 +111,10 @@ define <8 x i16> @test_masked_z_16xi16_t
define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -129,10 +123,9 @@ define <8 x i16> @test_16xi16_to_8xi16_p
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
@@ -146,10 +139,9 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
@@ -922,10 +914,10 @@ define <8 x i16> @test_masked_z_32xi16_t
define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,8,9,8,9,2,3,2,3,12,13,14,15]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,8,4,12,9,4,14,15,14,8,4,12,9,4,14,15]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15>
@@ -2232,9 +2224,8 @@ define <4 x i64> @test_masked_z_8xi64_to
define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2243,11 +2234,9 @@ define <2 x i64> @test_8xi64_to_2xi64_pe
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm3[0],xmm0[0]
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2259,10 +2248,9 @@ define <2 x i64> @test_masked_8xi64_to_2
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm0[0]
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -4432,13 +4420,17 @@ define <2 x double> @test_masked_z_8xdou
ret <2 x double> %res
}
+; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm2
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,4,1,4,1,4,1,4]
+; CHECK-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm2
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
+; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@@ -4447,13 +4439,17 @@ define <2 x double> @test_masked_8xdoubl
ret <2 x double> %res
}
+; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,4,1,4,1,4,1,4]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
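Note that the vmovapd + vshufpd sequences removed in the two mem_mask1 cases above appear to be exactly the split-load VSHUFPD form the new TODO comments describe, i.e. these two cases are known regressions from this combine, flagged for a follow-up rather than fixed here.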
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll?rev=359292&r1=359291&r2=359292&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll Fri Apr 26 02:56:14 2019
@@ -363,10 +363,10 @@ define <8 x float> @test_v16f32_0_1_2_3_
define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_3_6:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,3,3]
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,3,6,0,1,3,6,0,1,3,6,0,1,3,6]
+; ALL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%res = shufflevector <16 x float> %v, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 6>